starting with document
parent
7ed5a39bac
commit
accdfbca67
|
|
@ -1,3 +1,7 @@
|
||||||
|
// Applies the Kotlin serialization compiler plugin to this module.
// NOTE(review): embeddedKotlinVersion is presumably the Kotlin version bundled with
// Gradle's Kotlin DSL - confirm it matches the Kotlin version this project compiles with.
plugins {
|
||||||
|
kotlin("plugin.serialization") version embeddedKotlinVersion
|
||||||
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
// used for contentType
|
// used for contentType
|
||||||
api("io.ktor:ktor-http-jvm:2.3.7")
|
api("io.ktor:ktor-http-jvm:2.3.7")
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,7 @@
|
||||||
package de.itkl.httpClient.clients
|
package de.itkl.core_api.dtos
|
||||||
|
|
||||||
|
|
||||||
import kotlinx.datetime.Instant
|
import kotlinx.datetime.Instant
|
||||||
import kotlinx.datetime.LocalDateTime
|
|
||||||
import kotlinx.serialization.SerialName
|
import kotlinx.serialization.SerialName
|
||||||
import kotlinx.serialization.Serializable
|
import kotlinx.serialization.Serializable
|
||||||
|
|
||||||
|
|
@ -1,11 +1,15 @@
|
||||||
package de.itkl.core_api.interfaces
|
package de.itkl.core_api.interfaces
|
||||||
|
|
||||||
import io.ktor.http.*
|
import io.ktor.http.*
|
||||||
|
import kotlinx.serialization.DeserializationStrategy
|
||||||
|
import kotlinx.serialization.KSerializer
|
||||||
|
import kotlinx.serialization.json.Json
|
||||||
import org.koin.core.component.KoinComponent
|
import org.koin.core.component.KoinComponent
|
||||||
import org.koin.core.component.get
|
import org.koin.core.component.get
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import java.io.InputStream
|
import java.io.InputStream
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
|
import kotlin.reflect.KClass
|
||||||
|
|
||||||
interface Resource {
|
interface Resource {
|
||||||
val filename: String
|
val filename: String
|
||||||
|
|
@ -16,8 +20,13 @@ interface Resource {
|
||||||
val path: Path?
|
val path: Path?
|
||||||
fun read(): InputStream
|
fun read(): InputStream
|
||||||
|
|
||||||
|
/**
 * Reads this resource completely and decodes its content as JSON.
 *
 * The bytes are decoded as UTF-8 text before parsing, and the stream returned by
 * [read] is closed once it has been consumed.
 *
 * @param deserializer strategy used to decode the JSON text into [T]
 * @return the value decoded from the resource content
 */
fun <T: Any> json(deserializer: DeserializationStrategy<T>): T {
    // ByteArray.contentToString() renders a debug listing of the byte values
    // ("[123, 34, ...]"), not the text itself, so the payload has to be decoded
    // with decodeToString() before it can be parsed as JSON.
    val text = read().use { stream -> stream.readAllBytes().decodeToString() }
    return Json.decodeFromString(deserializer, text)
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Automatically adds koin injectable decorators to reading/writing
|
* Automatically adds koin injectable decorators to reading/writing
|
||||||
* operations
|
* operations
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
package de.itkl.core_api.interfaces.data
|
||||||
|
|
||||||
|
import de.itkl.core_api.interfaces.FileProcessor
|
||||||
|
|
||||||
|
/**
 * Contract for objects that can be processed with a [FileProcessor].
 */
interface Processable {
|
||||||
|
/** Processes this object using the given [fileProcessor]. */
suspend fun process(fileProcessor: FileProcessor)
|
||||||
|
}
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
package de.itkl.httpClient.clients
|
package de.itkl.httpClient.clients
|
||||||
|
|
||||||
|
import de.itkl.core_api.dtos.MsOcrResponse
|
||||||
import de.itkl.core_api.interfaces.Resource
|
import de.itkl.core_api.interfaces.Resource
|
||||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||||
import io.ktor.client.*
|
import io.ktor.client.*
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ package de.itkl.textprocessing
|
||||||
import de.itkl.assetmanager.interfaces.Project
|
import de.itkl.assetmanager.interfaces.Project
|
||||||
import de.itkl.assetmanager.interfaces.ProjectManager
|
import de.itkl.assetmanager.interfaces.ProjectManager
|
||||||
import de.itkl.core_api.interfaces.FileProcessor
|
import de.itkl.core_api.interfaces.FileProcessor
|
||||||
|
import de.itkl.core_api.interfaces.data.Processable
|
||||||
import org.koin.core.component.KoinComponent
|
import org.koin.core.component.KoinComponent
|
||||||
import org.koin.core.component.inject
|
import org.koin.core.component.inject
|
||||||
import org.koin.java.KoinJavaComponent.inject
|
import org.koin.java.KoinJavaComponent.inject
|
||||||
|
|
@ -14,11 +15,11 @@ class CorpusFactory : KoinComponent {
|
||||||
return Corpus(projectManager.load(name))
|
return Corpus(projectManager.load(name))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
class Corpus(private val project: Project) : KoinComponent {
|
class Corpus(private val project: Project): Processable {
|
||||||
val displayName get() = project.displayName
|
val displayName get() = project.displayName
|
||||||
val documentNames get() = project.documentNames
|
val documentNames get() = project.documentNames
|
||||||
|
|
||||||
suspend fun process(fileProcessor: FileProcessor) {
|
override suspend fun process(fileProcessor: FileProcessor) {
|
||||||
TODO("NEXT")
|
TODO("NEXT")
|
||||||
}
|
}
|
||||||
suspend fun document(name: String): Document {
|
suspend fun document(name: String): Document {
|
||||||
|
|
|
||||||
|
|
@ -1,33 +1,87 @@
|
||||||
package de.itkl.textprocessing
|
package de.itkl.textprocessing
|
||||||
|
|
||||||
|
import de.itkl.assetmanager.interfaces.AssetManager
|
||||||
|
import de.itkl.core_api.dtos.MsOcrResponse
|
||||||
|
import de.itkl.core_api.interfaces.FileProcessor
|
||||||
import de.itkl.core_api.interfaces.Resource
|
import de.itkl.core_api.interfaces.Resource
|
||||||
|
import de.itkl.core_api.interfaces.data.Processable
|
||||||
import kotlinx.coroutines.flow.Flow
|
import kotlinx.coroutines.flow.Flow
|
||||||
import kotlinx.coroutines.flow.asFlow
|
import kotlinx.coroutines.flow.asFlow
|
||||||
import kotlinx.coroutines.flow.filter
|
import kotlinx.coroutines.flow.filter
|
||||||
import me.piruin.geok.BBox
|
import me.piruin.geok.LatLng
|
||||||
import me.piruin.geok.geometry.Polygon
|
import me.piruin.geok.geometry.Polygon
|
||||||
|
import org.koin.core.component.KoinComponent
|
||||||
|
import org.koin.core.component.inject
|
||||||
|
|
||||||
class Document(
|
class Document(
|
||||||
val name: String,
|
val name: String,
|
||||||
val resources: List<Resource>
|
val resources: List<Resource>
|
||||||
) {
|
) : Processable, KoinComponent {
|
||||||
|
private val assetManager: AssetManager by inject()
|
||||||
|
|
||||||
|
/**
 * Loads the OCR pages that were extracted for this document. Note that not
 * every page needs to have OCR.
 *
 * @return one [OcrPage] per read result of the stored MS OCR response
 * @throws IllegalStateException if no "ms-ocr" asset can be retrieved for [name]
 */
suspend fun retrieveOcrPages(): List<OcrPage> {
    // TODO: How to identify the assets independently from their name?
    val storedOcr = assetManager.assets(name).retrieve("ms-ocr")
    val ocrResource = checkNotNull(storedOcr) { "Ocr for $name is not yet created" }

    val readResults = ocrResource.json(MsOcrResponse.serializer()).analyzeResult.readResults
    return readResults.map { readResult -> toOcrPage(readResult) }
}
|
||||||
|
/**
 * Runs the given [fileProcessor] on the first resource of this document.
 *
 * Only the first entry of [resources] is processed; `first()` throws a
 * [NoSuchElementException] when the document has no resources.
 */
override suspend fun process(fileProcessor: FileProcessor) {
    // TODO: Rework the whole fileprocessor. Should work independently from a pipeline
    val primaryResource = resources.first()
    fileProcessor.process(primaryResource)
}
|
||||||
|
|
||||||
|
/**
 * Converts one read result of the MS OCR response into an [OcrPage].
 *
 * The words of all lines are flattened into a single list; page number, width
 * and height are copied from the read result.
 */
private fun toOcrPage(readResult: MsOcrResponse.AnalyzeResult.ReadResult): OcrPage {
    val pageWords = readResult.lines.flatMap { line ->
        line.words.map { word -> toOcrWord(word) }
    }
    return OcrPage(
        width = readResult.width,
        height = readResult.height,
        pageNumber = readResult.page,
        words = pageWords,
    )
}
|
||||||
|
/**
 * Converts a recognized word of the MS OCR response into an [OcrPage.OcrWord].
 *
 * The first eight entries of the word's bounding box are read as four consecutive
 * value pairs; each pair becomes one corner of the word polygon.
 * NOTE(review): the pairs are presumably x/y page coordinates stored in [LatLng] - confirm.
 */
private fun toOcrWord(word: MsOcrResponse.AnalyzeResult.ReadResult.Line.Word): OcrPage.OcrWord {
    val box = word.boundingBox
    // Offsets of the first value of each corner pair inside the flat bounding box.
    val corners = listOf(0, 2, 4, 6).map { offset ->
        LatLng(box[offset].toDouble(), box[offset + 1].toDouble())
    }
    return OcrPage.OcrWord(polygon = Polygon(corners), text = word.text)
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class OcrPage(
|
class OcrPage(
|
||||||
val words: List<Word>,
|
val width: Int,
|
||||||
val regions: List<DocumentRegion>
|
val height: Int,
|
||||||
|
val pageNumber: Int,
|
||||||
|
val words: List<OcrWord>,
|
||||||
|
val regions: List<DocumentRegion> = emptyList()
|
||||||
) {
|
) {
|
||||||
inner class DocumentRegion(
|
inner class DocumentRegion(
|
||||||
private val polygon: Polygon,
|
private val polygon: Polygon,
|
||||||
private val type: String,
|
private val type: String,
|
||||||
) {
|
) {
|
||||||
fun words(): Flow<Word> {
|
fun words(): Flow<OcrWord> {
|
||||||
return words
|
return words
|
||||||
.asFlow()
|
.asFlow()
|
||||||
.filter { word -> word.polygon.intersectionWith(polygon) != null }
|
.filter { word -> word.polygon.intersectionWith(polygon) != null }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
inner class Word(
|
|
||||||
|
/**
 * Builds an [OcrWord] from the given polygon and text.
 *
 * NOTE(review): despite its name, this function only constructs and returns the
 * word; nothing here adds it to [words].
 */
fun addOcrWord(polygon: Polygon, text: String): OcrWord = OcrWord(polygon, text)
|
||||||
|
class OcrWord(
|
||||||
val polygon: Polygon,
|
val polygon: Polygon,
|
||||||
val text: String
|
val text: String
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue