58 lines
1.7 KiB
Plaintext
58 lines
1.7 KiB
Plaintext
use std assert
|
|
|
|
# Completion source for the --authority flag of `main`.
# Each row offers one known service base URL plus a short description.
def comp-authority [] {
    [
        [value, description];
        ["http://10.54.150.152:5000", "internal ms ocr"]
    ]
}
|
|
|
|
# Post a file to the OCR service's `read/syncAnalyze` endpoint and return
# whatever `http post` yields.
#
# pdf:         path of the file to upload (sent as application/octet-stream)
# --authority: base URL of the service, e.g. "http://10.54.150.152:5000";
#              when omitted, $env.MS_OCR_AUTHORITY is used instead
#
# Fails the assertion when neither --authority nor MS_OCR_AUTHORITY is set.
export def main [pdf:path, --authority:string@comp-authority] {
    # An omitted flag arrives as null; fall back to the environment in that case.
    let base_url = if $authority == null {
        assert ("MS_OCR_AUTHORITY" in $env) "no authority specified. Set the MS_OCR_AUTHORITY environment variable or pass --authority"
        $env.MS_OCR_AUTHORITY
    } else {
        $authority
    }

    # The request asks for German text in natural reading order.
    http post -H [content-type, application/octet-stream] $"($base_url)/vision/v3.2/read/syncAnalyze?language=de&readingOrder=natural" (open $pdf)
}
|
|
|
|
# Run `main` over every path received on the pipeline.
#
# input:  list of file paths
# output: one record {filename, ocr} per path that exists on disk
#
# Every path is announced on the terminal before the existence check; a
# missing path yields no record because the `if` has no else branch.
export def batch [] {
    each { |file|
        let shown_name = ($file | path basename)
        print $"ms ocr processing of (ansi gb)($shown_name)(ansi reset)"
        if ($file | path exists) {
            {filename: $file, ocr: (main $file)}
        }
    }
}
|
|
|
|
# Keep only the selected pages of an OCR result.
#
# input:           OCR result record containing `analyzeResult.readResults`
# ...page_numbers: zero-based positions within `readResults` to keep
# output:          the same record with `analyzeResult.readResults` reduced to
#                  the selected pages, in their original order; every other
#                  field of `analyzeResult` is left as it was
export def "slice pages" [...page_numbers:int] {
    let ocr = ($in)
    let pages = ($ocr
        | get analyzeResult.readResults
        | enumerate
        | where { |x| $x.index in $page_numbers }
        | get item
    )
    # Replace only the page list. `wrap` on a list would create one
    # {readResults: page} row per page and overwrite the whole analyzeResult.
    $ocr | update analyzeResult.readResults $pages
}
|
|
|
|
# OCR every matching file in `directory` and store each result as JSON.
#
# directory: folder to scan; the default "" leaves the glob pattern without a
#            directory prefix
#
# Creates `ocr_cache` and writes one "<basename>-ms-ocr.json" file per
# processed input into it, overwriting existing files.
export def create-ocr-cache [directory:string=""] {
    mkdir ocr_cache
    let pattern = $"([$directory, *{pdf,png,jpg,jpeg,tif,tiff}] | path join)"
    (glob $pattern
        | batch
        | each { |entry|
            let target = $"ocr_cache/($entry.filename | path basename)-ms-ocr.json"
            $entry.ocr | save --force $target
        }
    )
}
|
|
|
|
|
|
# Extract the text of every word from an OCR result.
#
# input:  OCR result record containing `analyzeResult.readResults`
# output: flat list of the `text` values of all words, across all lines
export def "to-words" [] {
    let all_lines = ($in | get analyzeResult.readResults.lines | flatten)
    let all_words = ($all_lines | get words | flatten)
    $all_words | get text
}
|
|
|
|
# Extract the text of every line from an OCR result.
#
# input:  OCR result record containing `analyzeResult.readResults`
# output: flat list of the `text` values of all lines
export def "to-lines" [] {
    let all_lines = ($in | get analyzeResult.readResults.lines | flatten)
    $all_lines | get text
}
|
|
|
|
# Render an OCR result as one string.
#
# input:  OCR result record accepted by `to-lines`
# output: all line texts joined with ". "
export def "to-text" [] {
    let line_texts = ($in | to-lines)
    $line_texts | str join '. '
}