# nu-modules/ms-ocr.nu

# 58 lines
# 1.7 KiB
# Plaintext

use std assert
# Completion candidates offered for the --authority flag of `main`.
# Each row pairs a service base URL with a short description.
def comp-authority [] {
    [[value description]; ["http://10.54.150.152:5000" "internal ms ocr"]]
}
# Upload a file to the MS OCR read endpoint and return what `http post` yields.
#
# The service base URL comes from --authority when given, otherwise from the
# MS_OCR_AUTHORITY environment variable; the command fails if neither is set.
# The request asks for language=de and readingOrder=natural.
export def main [
    pdf: path                             # file whose bytes are sent as the request body
    --authority: string@comp-authority    # base URL of the OCR service
] {
    # An omitted flag is null; fall back to the environment in that case.
    let authority = if $authority == null {
        assert ("MS_OCR_AUTHORITY" in $env) "no authority specified. Set the MS_OCR_AUTHORITY environment variable or pass --authority"
        $env.MS_OCR_AUTHORITY
    } else {
        $authority
    }
    # --raw: send the file's bytes untouched instead of letting `open` parse it by extension.
    http post -H [content-type, application/octet-stream] $"($authority)/vision/v3.2/read/syncAnalyze?language=de&readingOrder=natural" (open --raw $pdf)
}
# Run OCR over a list of file paths taken from the pipeline.
# list<str> => [{filename: str, ocr: table}]
# Every path is announced on the terminal; a path that does not exist
# yields no record.
export def batch [] {
    each { |file|
        let label = ($file | path basename)
        print $"ms ocr processing of (ansi gb)($label)(ansi reset)"
        if ($file | path exists) {
            {filename: $file, ocr: (main $file)}
        }
    }
}
# Keep only the selected pages of an OCR result received from the pipeline.
#
# Page numbers are zero-based positions in analyzeResult.readResults.
# All other fields of the input record are passed through unchanged.
export def "slice pages" [
    ...page_numbers: int    # zero-based positions of the pages to keep
] {
    let ocr = $in
    let pages = ($ocr
        | get analyzeResult.readResults
        | enumerate
        | where { |x| $x.index in $page_numbers }
        | get item
    )
    # Replace only the page list. Wrapping the list and overwriting the whole
    # analyzeResult would turn it into a one-row-per-page table and drop its
    # remaining fields.
    $ocr | update analyzeResult.readResults $pages
}
# OCR every pdf/png/jpg/jpeg/tif/tiff file directly inside a directory and save
# each result under ./ocr_cache (created if missing) as "<basename>-ms-ocr.json".
# Existing cache files are overwritten.
export def create-ocr-cache [
    directory: string = ""    # directory to scan; empty leaves the pattern relative
] {
    mkdir ocr_cache
    # The dot before the brace group ties the match to a real file extension;
    # without it any name merely ending in e.g. "pdf" would be picked up.
    let pattern = ($directory | path join "*.{pdf,png,jpg,jpeg,tif,tiff}")
    glob $pattern | batch | each { |entry|
        $entry.ocr | save --force $"ocr_cache/($entry.filename | path basename)-ms-ocr.json"
    }
}
# Flatten an OCR result from the pipeline into the list of its word texts,
# walking readResults -> lines -> words.
export def "to-words" [] {
    let lines = ($in | get analyzeResult.readResults.lines | flatten)
    let words = ($lines | get words | flatten)
    $words | get text
}
# Flatten an OCR result from the pipeline into the list of its line texts,
# walking readResults -> lines.
export def "to-lines" [] {
    let lines = ($in | get analyzeResult.readResults.lines | flatten)
    $lines | get text
}
# Render an OCR result from the pipeline as one string: the line texts
# produced by `to-lines`, joined with '. '.
export def "to-text" [] {
    let lines = ($in | to-lines)
    $lines | str join '. '
}