diff --git a/libraries/core-api/build.gradle.kts b/libraries/core-api/build.gradle.kts index 7bf4c6c..03290c8 100644 --- a/libraries/core-api/build.gradle.kts +++ b/libraries/core-api/build.gradle.kts @@ -1,3 +1,7 @@ +plugins { + kotlin("plugin.serialization") version embeddedKotlinVersion +} + dependencies { // used for contentType api("io.ktor:ktor-http-jvm:2.3.7") diff --git a/libraries/httpClient/src/main/kotlin/de/itkl/httpClient/clients/MsOcrResponse.kt b/libraries/core-api/src/main/kotlin/de/itkl/core_api/dtos/MsOcrResponse.kt similarity index 97% rename from libraries/httpClient/src/main/kotlin/de/itkl/httpClient/clients/MsOcrResponse.kt rename to libraries/core-api/src/main/kotlin/de/itkl/core_api/dtos/MsOcrResponse.kt index 835cad6..8bfb1b0 100644 --- a/libraries/httpClient/src/main/kotlin/de/itkl/httpClient/clients/MsOcrResponse.kt +++ b/libraries/core-api/src/main/kotlin/de/itkl/core_api/dtos/MsOcrResponse.kt @@ -1,8 +1,7 @@ -package de.itkl.httpClient.clients +package de.itkl.core_api.dtos import kotlinx.datetime.Instant -import kotlinx.datetime.LocalDateTime import kotlinx.serialization.SerialName import kotlinx.serialization.Serializable diff --git a/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/Resource.kt b/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/Resource.kt index bde4f94..52c51f0 100644 --- a/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/Resource.kt +++ b/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/Resource.kt @@ -1,11 +1,15 @@ package de.itkl.core_api.interfaces import io.ktor.http.* +import kotlinx.serialization.DeserializationStrategy +import kotlinx.serialization.KSerializer +import kotlinx.serialization.json.Json import org.koin.core.component.KoinComponent import org.koin.core.component.get import java.io.File import java.io.InputStream import java.nio.file.Path +import kotlin.reflect.KClass interface Resource { val filename: String @@ -16,8 +20,13 @@ interface 
Resource { val path: Path? fun read(): InputStream + /** Reads the whole resource as UTF-8 text, closes the stream and decodes the text as JSON with [deserializer]. */ fun <T> json(deserializer: DeserializationStrategy<T>): T { + return Json.decodeFromString(deserializer, read().use { it.readAllBytes().decodeToString() }) + } } + + /** * Automatically adds koin injectable decorators to reading/writing * operations diff --git a/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/data/Processable.kt b/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/data/Processable.kt new file mode 100644 index 0000000..7db75c3 --- /dev/null +++ b/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/data/Processable.kt @@ -0,0 +1,7 @@ +package de.itkl.core_api.interfaces.data + +import de.itkl.core_api.interfaces.FileProcessor + +interface Processable { + suspend fun process(fileProcessor: FileProcessor) +} \ No newline at end of file diff --git a/libraries/httpClient/src/main/kotlin/de/itkl/httpClient/clients/MsOcr.kt b/libraries/httpClient/src/main/kotlin/de/itkl/httpClient/clients/MsOcr.kt index 0ea6c06..1f1233d 100644 --- a/libraries/httpClient/src/main/kotlin/de/itkl/httpClient/clients/MsOcr.kt +++ b/libraries/httpClient/src/main/kotlin/de/itkl/httpClient/clients/MsOcr.kt @@ -1,5 +1,6 @@ package de.itkl.httpClient.clients +import de.itkl.core_api.dtos.MsOcrResponse import de.itkl.core_api.interfaces.Resource import io.github.oshai.kotlinlogging.KotlinLogging import io.ktor.client.* diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Corpus.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Corpus.kt index 1c008ae..62de6cc 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Corpus.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Corpus.kt @@ -3,6 +3,7 @@ package de.itkl.textprocessing import de.itkl.assetmanager.interfaces.Project import de.itkl.assetmanager.interfaces.ProjectManager import de.itkl.core_api.interfaces.FileProcessor +import de.itkl.core_api.interfaces.data.Processable
import org.koin.core.component.KoinComponent import org.koin.core.component.inject import org.koin.java.KoinJavaComponent.inject @@ -14,11 +15,11 @@ class CorpusFactory : KoinComponent { return Corpus(projectManager.load(name)) } } -class Corpus(private val project: Project) : KoinComponent { +class Corpus(private val project: Project): Processable { val displayName get() = project.displayName val documentNames get() = project.documentNames - suspend fun process(fileProcessor: FileProcessor) { + override suspend fun process(fileProcessor: FileProcessor) { TODO("NEXT") } suspend fun document(name: String): Document { diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/DocumentContainer.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/DocumentContainer.kt index 6d28352..e0f6bfc 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/DocumentContainer.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/DocumentContainer.kt @@ -1,33 +1,87 @@ package de.itkl.textprocessing +import de.itkl.assetmanager.interfaces.AssetManager +import de.itkl.core_api.dtos.MsOcrResponse +import de.itkl.core_api.interfaces.FileProcessor import de.itkl.core_api.interfaces.Resource +import de.itkl.core_api.interfaces.data.Processable import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.asFlow import kotlinx.coroutines.flow.filter -import me.piruin.geok.BBox +import me.piruin.geok.LatLng import me.piruin.geok.geometry.Polygon +import org.koin.core.component.KoinComponent +import org.koin.core.component.inject class Document( val name: String, val resources: List<Resource> -) { +) : Processable, KoinComponent { + private val assetManager: AssetManager by inject() + + /** + * Loads the OCR pages extracted for this document from its "ms-ocr" asset. + * Not every page needs to have OCR; throws IllegalStateException if the asset is missing. + */ + suspend fun retrieveOcrPages(): List<OcrPage> { + // TODO: How to identify the assets independently from their name?
+ val resource = checkNotNull(assetManager.assets(name) + .retrieve("ms-ocr")) { + "Ocr for $name is not yet created" + } + val msOcrResponse = resource.json(MsOcrResponse.serializer()) + + return msOcrResponse.analyzeResult.readResults.map { toOcrPage(it) } + } + override suspend fun process(fileProcessor: FileProcessor) { + // TODO: Rework the whole fileprocessor. Should work independently from a pipeline + fileProcessor.process(resources.first()) + } + + private fun toOcrPage(readResult: MsOcrResponse.AnalyzeResult.ReadResult): OcrPage { + return OcrPage( + pageNumber = readResult.page, + width = readResult.width, + height = readResult.height, + words = readResult.lines.flatMap { line -> line.words.map { toOcrWord(it) } } + ) + } + private fun toOcrWord(word: MsOcrResponse.AnalyzeResult.ReadResult.Line.Word): OcrPage.OcrWord { + val box = word.boundingBox + return OcrPage.OcrWord( + polygon = Polygon(listOf( + LatLng(box[0].toDouble(), box[1].toDouble()), + LatLng(box[2].toDouble(), box[3].toDouble()), + LatLng(box[4].toDouble(), box[5].toDouble()), + LatLng(box[6].toDouble(), box[7].toDouble()), + )), + text = word.text + ) + } } class OcrPage( - val words: List, - val regions: List + val width: Int, + val height: Int, + val pageNumber: Int, + val words: List, + val regions: List = emptyList() ) { inner class DocumentRegion( private val polygon: Polygon, private val type: String, ) { - fun words(): Flow { + fun words(): Flow { return words .asFlow() .filter { word -> word.polygon.intersectionWith(polygon) != null } } } - inner class Word( + + fun addOcrWord(polygon: Polygon, text: String): OcrWord { + return OcrWord(polygon, text) + } + class OcrWord( val polygon: Polygon, val text: String )