starting with document
parent
7ed5a39bac
commit
accdfbca67
|
|
@ -1,3 +1,7 @@
|
|||
// Applies the Kotlin serialization Gradle plugin, using the version given by `embeddedKotlinVersion`.
plugins {
|
||||
kotlin("plugin.serialization") version embeddedKotlinVersion
|
||||
}
|
||||
|
||||
dependencies {
|
||||
// used for contentType
|
||||
api("io.ktor:ktor-http-jvm:2.3.7")
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
package de.itkl.httpClient.clients
|
||||
package de.itkl.core_api.dtos
|
||||
|
||||
|
||||
import kotlinx.datetime.Instant
|
||||
import kotlinx.datetime.LocalDateTime
|
||||
import kotlinx.serialization.SerialName
|
||||
import kotlinx.serialization.Serializable
|
||||
|
||||
|
|
@ -1,11 +1,15 @@
|
|||
package de.itkl.core_api.interfaces
|
||||
|
||||
import io.ktor.http.*
|
||||
import kotlinx.serialization.DeserializationStrategy
|
||||
import kotlinx.serialization.KSerializer
|
||||
import kotlinx.serialization.json.Json
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.get
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.nio.file.Path
|
||||
import kotlin.reflect.KClass
|
||||
|
||||
interface Resource {
|
||||
val filename: String
|
||||
|
|
@ -16,7 +20,12 @@ interface Resource {
|
|||
val path: Path?
|
||||
fun read(): InputStream
|
||||
|
||||
/**
 * Reads this resource completely and decodes its content as JSON.
 *
 * The bytes obtained from [read] are decoded as UTF-8 text before parsing, and
 * the stream is closed once it has been consumed.
 *
 * @param deserializer strategy used to build a [T] from the JSON text
 * @return the value decoded from the resource content
 * @throws kotlinx.serialization.SerializationException if the content is not
 *   valid JSON for [deserializer]
 */
fun <T: Any> json(deserializer: DeserializationStrategy<T>): T {
    // Decode the bytes as text: `contentToString()` would only render the array's
    // elements (e.g. "[123, 34, ...]"), which is never the JSON document itself.
    val text = read().use { stream -> stream.readAllBytes().decodeToString() }
    return Json.decodeFromString(deserializer, text)
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Automatically adds koin injectable decorators to reading/writing
|
||||
|
|
|
|||
|
|
@ -0,0 +1,7 @@
|
|||
package de.itkl.core_api.interfaces.data
|
||||
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
|
||||
/**
 * Contract for objects that can be handed to a [FileProcessor].
 */
interface Processable {
    /** Processes this object using the given [fileProcessor]. */
    suspend fun process(fileProcessor: FileProcessor)
}
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
package de.itkl.httpClient.clients
|
||||
|
||||
import de.itkl.core_api.dtos.MsOcrResponse
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
import io.ktor.client.*
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package de.itkl.textprocessing
|
|||
import de.itkl.assetmanager.interfaces.Project
|
||||
import de.itkl.assetmanager.interfaces.ProjectManager
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
import de.itkl.core_api.interfaces.data.Processable
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.inject
|
||||
import org.koin.java.KoinJavaComponent.inject
|
||||
|
|
@ -14,11 +15,11 @@ class CorpusFactory : KoinComponent {
|
|||
return Corpus(projectManager.load(name))
|
||||
}
|
||||
}
|
||||
class Corpus(private val project: Project) : KoinComponent {
|
||||
class Corpus(private val project: Project): Processable {
|
||||
val displayName get() = project.displayName
|
||||
val documentNames get() = project.documentNames
|
||||
|
||||
suspend fun process(fileProcessor: FileProcessor) {
|
||||
override suspend fun process(fileProcessor: FileProcessor) {
|
||||
TODO("NEXT")
|
||||
}
|
||||
suspend fun document(name: String): Document {
|
||||
|
|
|
|||
|
|
@ -1,33 +1,87 @@
|
|||
package de.itkl.textprocessing
|
||||
|
||||
import de.itkl.assetmanager.interfaces.AssetManager
|
||||
import de.itkl.core_api.dtos.MsOcrResponse
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.core_api.interfaces.data.Processable
|
||||
import kotlinx.coroutines.flow.Flow
|
||||
import kotlinx.coroutines.flow.asFlow
|
||||
import kotlinx.coroutines.flow.filter
|
||||
import me.piruin.geok.BBox
|
||||
import me.piruin.geok.LatLng
|
||||
import me.piruin.geok.geometry.Polygon
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.inject
|
||||
|
||||
class Document(
|
||||
val name: String,
|
||||
val resources: List<Resource>
|
||||
) {
|
||||
) : Processable, KoinComponent {
|
||||
private val assetManager: AssetManager by inject()
|
||||
|
||||
/**
 * Retrieves the OCR pages that were extracted for this document.
 *
 * The pages are read from the asset stored under the key "ms-ocr". Note that
 * not every page needs to have OCR.
 *
 * @return one [OcrPage] per read result of the stored OCR response
 * @throws IllegalStateException if no "ms-ocr" asset exists for this document yet
 */
suspend fun retrieveOcrPages(): List<OcrPage> {
    // TODO: How to identify the assets independently from their name?
    val candidate = assetManager.assets(name).retrieve("ms-ocr")
    val ocrAsset = checkNotNull(candidate) { "Ocr for $name is not yet created" }

    return ocrAsset
        .json(MsOcrResponse.serializer())
        .analyzeResult
        .readResults
        .map(::toOcrPage)
}
|
||||
/**
 * Runs [fileProcessor] on this document.
 *
 * Only the first entry of [resources] is handed to the processor.
 *
 * @throws NoSuchElementException if [resources] is empty
 */
override suspend fun process(fileProcessor: FileProcessor) {
    // TODO: Rework the whole fileprocessor. Should work independently from a pipeline
    val primaryResource = resources.first()
    fileProcessor.process(primaryResource)
}
|
||||
|
||||
/**
 * Converts a single read result of the OCR response into an [OcrPage].
 *
 * The words of all lines of the read result are flattened into one list.
 */
private fun toOcrPage(readResult: MsOcrResponse.AnalyzeResult.ReadResult): OcrPage {
    val pageWords = readResult.lines.flatMap { ocrLine -> ocrLine.words.map(::toOcrWord) }
    return OcrPage(
        pageNumber = readResult.page,
        width = readResult.width,
        height = readResult.height,
        words = pageWords,
    )
}
|
||||
/**
 * Converts a word of the OCR response into an [OcrPage.OcrWord].
 *
 * The first eight entries of the word's bounding box are read as four
 * consecutive value pairs; each pair becomes one vertex of the polygon.
 * NOTE(review): presumably these are page coordinates carried in [LatLng] — confirm the axis order.
 */
private fun toOcrWord(word: MsOcrResponse.AnalyzeResult.ReadResult.Line.Word): OcrPage.OcrWord {
    val coordinates = word.boundingBox
    // Offsets of the first value of each of the four vertex pairs.
    val corners = listOf(0, 2, 4, 6).map { offset ->
        LatLng(coordinates[offset].toDouble(), coordinates[offset + 1].toDouble())
    }
    return OcrPage.OcrWord(polygon = Polygon(corners), text = word.text)
}
|
||||
}
|
||||
|
||||
class OcrPage(
|
||||
val words: List<Word>,
|
||||
val regions: List<DocumentRegion>
|
||||
val width: Int,
|
||||
val height: Int,
|
||||
val pageNumber: Int,
|
||||
val words: List<OcrWord>,
|
||||
val regions: List<DocumentRegion> = emptyList()
|
||||
) {
|
||||
inner class DocumentRegion(
|
||||
private val polygon: Polygon,
|
||||
private val type: String,
|
||||
) {
|
||||
fun words(): Flow<Word> {
|
||||
fun words(): Flow<OcrWord> {
|
||||
return words
|
||||
.asFlow()
|
||||
.filter { word -> word.polygon.intersectionWith(polygon) != null }
|
||||
}
|
||||
}
|
||||
inner class Word(
|
||||
|
||||
/**
 * Creates an [OcrWord] from the given [polygon] and [text].
 *
 * The new word is only returned; this function does not modify [words].
 */
fun addOcrWord(polygon: Polygon, text: String): OcrWord = OcrWord(polygon, text)
|
||||
/**
 * A single OCR word: its [text] together with the [polygon] associated with it.
 */
class OcrWord(
    val polygon: Polygon,
    val text: String,
)
|
||||
|
|
|
|||
Loading…
Reference in New Issue