use std assert

# Completion candidates for the `--authority` flag of `main`.
def comp-authority [] {
    [
        { value: "http://10.54.150.152:5000", description: "internal ms ocr" },
    ]
}

# Post a file to the OCR service's synchronous read endpoint and return the response.
#
# The service base URL comes from `--authority`; when the flag is omitted it is
# read from `$env.MS_OCR_AUTHORITY`. The request asks for language `de` and
# natural reading order.
export def main [
    pdf: path                           # file whose content is sent as the request body
    --authority: string@comp-authority  # base URL of the OCR service, e.g. http://host:5000
] {
    let authority = if $authority == null {
        # No flag given: the environment variable is the only remaining source.
        assert ("MS_OCR_AUTHORITY" in $env) "no authority specified. Set the MS_OCR_AUTHORITY environment variable or specify --authority"
        $env.MS_OCR_AUTHORITY
    } else {
        $authority
    }

    let url = $"($authority)/vision/v3.2/read/syncAnalyze?language=de&readingOrder=natural"
    http post -H [content-type, application/octet-stream] $url (open $pdf)
}

# Run `main` on every path of the piped-in list.
#
# Paths that do not exist are skipped (with a note on stderr) and produce no
# output row. `main` is called without `--authority`, so `$env.MS_OCR_AUTHORITY`
# must be set.
#
# list => [{filename: str, ocr: <response of main>}]
export def batch [] {
    each { |image_path|
        print $"ms ocr processing of (ansi gb)($image_path | path basename)(ansi reset)"
        if ($image_path | path exists) {
            let result = (main $image_path)
            { filename: $image_path, ocr: $result }
        } else {
            # `print` yields nothing, so `each` drops this element as before.
            print --stderr $"skipping ($image_path): path does not exist"
        }
    }
}

# Keep only selected pages of an OCR result.
#
# `page_numbers` are matched against the 0-based position of each entry in
# `analyzeResult.readResults` (the index produced by `enumerate`).
# The whole `analyzeResult` value is replaced by the selected entries, each
# wrapped under a `readResults` column, so any other fields that were stored
# in `analyzeResult` are not carried over.
export def "slice pages" [
    ...page_numbers: int  # 0-based positions of the readResults entries to keep
] {
    let ocr = ($in)
    let slice = (
        $ocr
        | get analyzeResult.readResults
        | enumerate
        | where { |x| $x.index in $page_numbers }
        | get item
        | wrap readResults
    )
    $ocr | update analyzeResult $slice
}

# OCR every matching file in `directory` and store each result as JSON.
#
# Creates `ocr_cache` in the current working directory and writes one
# `<basename>-ms-ocr.json` file per processed input, overwriting existing files.
export def create-ocr-cache [
    directory: string = ""  # directory to scan; empty means the current directory
] {
    mkdir ocr_cache

    let pattern = ([$directory, "*{pdf,png,jpg,jpeg,tif,tiff}"] | path join)
    glob $pattern
    | batch
    | each { |x|
        $x.ocr | save --force $"ocr_cache/($x.filename | path basename)-ms-ocr.json"
    }
}

# List the text of every word in an OCR result, in document order.
export def "to-words" [] {
    get analyzeResult.readResults.lines
    | flatten
    | get words
    | flatten
    | get text
}

# List the text of every line in an OCR result, in document order.
export def "to-lines" [] {
    get analyzeResult.readResults.lines
    | flatten
    | get text
}

# Join all line texts of an OCR result into one string, separated by '. '.
export def "to-text" [] {
    to-lines | str join '. '
}