starting with document

3
Timo Bryant 2024-01-03 13:17:41 +01:00
parent 7ed5a39bac
commit accdfbca67
7 changed files with 85 additions and 10 deletions

View File

@ -1,3 +1,7 @@
// Applies the Kotlin serialization compiler plugin, pinned to the Kotlin version
// that is embedded in the running Gradle distribution (embeddedKotlinVersion).
plugins {
kotlin("plugin.serialization") version embeddedKotlinVersion
}
dependencies {
// used for contentType
api("io.ktor:ktor-http-jvm:2.3.7")

View File

@ -1,8 +1,7 @@
package de.itkl.httpClient.clients
package de.itkl.core_api.dtos
import kotlinx.datetime.Instant
import kotlinx.datetime.LocalDateTime
import kotlinx.serialization.SerialName
import kotlinx.serialization.Serializable

View File

@ -1,11 +1,15 @@
package de.itkl.core_api.interfaces
import io.ktor.http.*
import kotlinx.serialization.DeserializationStrategy
import kotlinx.serialization.KSerializer
import kotlinx.serialization.json.Json
import org.koin.core.component.KoinComponent
import org.koin.core.component.get
import java.io.File
import java.io.InputStream
import java.nio.file.Path
import kotlin.reflect.KClass
interface Resource {
val filename: String
@ -16,7 +20,12 @@ interface Resource {
val path: Path?
fun read(): InputStream
/**
 * Reads the complete content of this resource and decodes it as JSON.
 *
 * The bytes obtained from [read] are interpreted as UTF-8 text and the stream
 * is closed once it has been consumed.
 *
 * @param deserializer strategy describing how to build a [T] from the JSON text
 * @return the decoded value
 * @throws kotlinx.serialization.SerializationException if the content is not valid JSON for [T]
 */
fun <T: Any> json(deserializer: DeserializationStrategy<T>): T {
    // ByteArray.contentToString() would yield the element listing "[123, 34, ...]",
    // not the text itself; decodeToString() turns the UTF-8 bytes into the JSON document.
    val text = read().use { stream -> stream.readAllBytes().decodeToString() }
    return Json.decodeFromString(deserializer, text)
}
}
/**
* Automatically adds koin injectable decorators to reading/writing

View File

@ -0,0 +1,7 @@
package de.itkl.core_api.interfaces.data
import de.itkl.core_api.interfaces.FileProcessor
/**
 * Something that can be run through a [FileProcessor].
 */
interface Processable {
    /**
     * Processes this object using the given [fileProcessor].
     */
    suspend fun process(fileProcessor: FileProcessor)
}

View File

@ -1,5 +1,6 @@
package de.itkl.httpClient.clients
import de.itkl.core_api.dtos.MsOcrResponse
import de.itkl.core_api.interfaces.Resource
import io.github.oshai.kotlinlogging.KotlinLogging
import io.ktor.client.*

View File

@ -3,6 +3,7 @@ package de.itkl.textprocessing
import de.itkl.assetmanager.interfaces.Project
import de.itkl.assetmanager.interfaces.ProjectManager
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.data.Processable
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import org.koin.java.KoinJavaComponent.inject
@ -14,11 +15,11 @@ class CorpusFactory : KoinComponent {
return Corpus(projectManager.load(name))
}
}
class Corpus(private val project: Project) : KoinComponent {
class Corpus(private val project: Project): Processable {
val displayName get() = project.displayName
val documentNames get() = project.documentNames
suspend fun process(fileProcessor: FileProcessor) {
override suspend fun process(fileProcessor: FileProcessor) {
TODO("NEXT")
}
suspend fun document(name: String): Document {

View File

@ -1,33 +1,87 @@
package de.itkl.textprocessing
import de.itkl.assetmanager.interfaces.AssetManager
import de.itkl.core_api.dtos.MsOcrResponse
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.Resource
import de.itkl.core_api.interfaces.data.Processable
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.asFlow
import kotlinx.coroutines.flow.filter
import me.piruin.geok.BBox
import me.piruin.geok.LatLng
import me.piruin.geok.geometry.Polygon
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
class Document(
val name: String,
val resources: List<Resource>
) {
) : Processable, KoinComponent {
private val assetManager: AssetManager by inject()
/**
 * Loads the OCR pages that were extracted for this document.
 *
 * Note that not every page needs to have OCR results.
 *
 * @throws IllegalStateException if no "ms-ocr" asset is available for [name] yet
 */
suspend fun retrieveOcrPages(): List<OcrPage> {
    // TODO: How to identify the assets independently from their name?
    val storedOcr = assetManager.assets(name).retrieve("ms-ocr")
    val ocrResource = checkNotNull(storedOcr) { "Ocr for $name is not yet created" }
    val ocrResponse = ocrResource.json(MsOcrResponse.serializer())
    val readResults = ocrResponse.analyzeResult.readResults
    return readResults.map { readResult -> toOcrPage(readResult) }
}
/**
 * Hands the first entry of [resources] to the given [fileProcessor].
 * Any further resources of this document are not processed here.
 */
override suspend fun process(fileProcessor: FileProcessor) {
    // TODO: Rework the whole FileProcessor; it should work independently of a pipeline.
    val firstResource = resources.first()
    fileProcessor.process(firstResource)
}
/**
 * Converts a single read result of the MS OCR response into an [OcrPage].
 * The words of all lines are collected into one flat list, keeping line order.
 */
private fun toOcrPage(readResult: MsOcrResponse.AnalyzeResult.ReadResult): OcrPage {
    val pageWords = readResult.lines.flatMap { line ->
        line.words.map { word -> toOcrWord(word) }
    }
    return OcrPage(
        pageNumber = readResult.page,
        width = readResult.width,
        height = readResult.height,
        words = pageWords,
    )
}
/**
 * Converts a word of the MS OCR response into an [OcrPage.OcrWord].
 *
 * The first eight values of the word's bounding box are read as four consecutive
 * value pairs; each pair becomes one [LatLng] corner of the resulting [Polygon].
 */
private fun toOcrWord(word: MsOcrResponse.AnalyzeResult.ReadResult.Line.Word): OcrPage.OcrWord {
    val coordinates = word.boundingBox
    // Indices 0, 2, 4, 6 mark the start of each corner's value pair.
    val corners = (0 until 8 step 2).map { start ->
        LatLng(coordinates[start].toDouble(), coordinates[start + 1].toDouble())
    }
    return OcrPage.OcrWord(
        polygon = Polygon(corners),
        text = word.text
    )
}
}
class OcrPage(
val words: List<Word>,
val regions: List<DocumentRegion>
val width: Int,
val height: Int,
val pageNumber: Int,
val words: List<OcrWord>,
val regions: List<DocumentRegion> = emptyList()
) {
inner class DocumentRegion(
private val polygon: Polygon,
private val type: String,
) {
fun words(): Flow<Word> {
fun words(): Flow<OcrWord> {
return words
.asFlow()
.filter { word -> word.polygon.intersectionWith(polygon) != null }
}
}
inner class Word(
/**
 * Creates an [OcrWord] from the given [polygon] and [text].
 * The word is only returned; it is not stored in this page.
 */
fun addOcrWord(polygon: Polygon, text: String): OcrWord = OcrWord(polygon, text)
/**
 * A single word on an OCR page: its [text] together with the [polygon] it occupies.
 */
class OcrWord(val polygon: Polygon, val text: String)