starting with fileprocessor2

3
Timo Bryant 2024-01-04 11:56:20 +01:00
parent 9ea725fc36
commit 2cab145008
15 changed files with 110 additions and 31 deletions

View File

@ -1,12 +1,15 @@
package de.itkl.assetmanager package de.itkl.assetmanager
import de.itkl.assetmanager.implementation.AssetsFileProcessorBackend
import de.itkl.assetmanager.implementation.FilesystemAssetManager import de.itkl.assetmanager.implementation.FilesystemAssetManager
import de.itkl.assetmanager.implementation.FilesystemProjectManager import de.itkl.assetmanager.implementation.FilesystemProjectManager
import de.itkl.assetmanager.interfaces.AssetManager import de.itkl.assetmanager.interfaces.AssetManager
import de.itkl.assetmanager.interfaces.ProjectManager import de.itkl.assetmanager.interfaces.ProjectManager
import de.itkl.core_api.interfaces.assets.FileProcessorBackend
import org.koin.dsl.module import org.koin.dsl.module
val assetManagerModule = module { val assetManagerModule = module {
single<ProjectManager> { FilesystemProjectManager() } single<ProjectManager> { FilesystemProjectManager() }
single<AssetManager> { FilesystemAssetManager() } single<AssetManager> { FilesystemAssetManager() }
single<FileProcessorBackend> { AssetsFileProcessorBackend() }
} }

View File

@ -0,0 +1,22 @@
package de.itkl.assetmanager.implementation
import de.itkl.core_api.interfaces.FileProcessor2
import de.itkl.core_api.interfaces.Resource
import de.itkl.core_api.interfaces.assets.Assets
import de.itkl.core_api.interfaces.assets.FileProcessorBackend
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.component.KoinComponent
private val Log = KotlinLogging.logger { }
/**
 * [FileProcessorBackend] that only runs a [FileProcessor2] when its output is not
 * already present in the supplied [Assets].
 */
class AssetsFileProcessorBackend : FileProcessorBackend, KoinComponent {
    /**
     * Asks [assets] whether an entry named [FileProcessor2.filename] exists. If it does,
     * nothing is processed; otherwise [fileProcessor] is run on [resource] and the
     * resource it returns is stored in [assets].
     */
    override suspend fun process(resource: Resource, assets: Assets, fileProcessor: FileProcessor2) {
        Log.debug { "Call processor '${fileProcessor.filename}' on $resource" }

        val alreadyProcessed = assets.exists(fileProcessor.filename)
        if (alreadyProcessed) {
            Log.info { "${fileProcessor.filename} already exists on ${resource}. Skipping" }
            return
        }

        Log.info { "${fileProcessor.filename} does not yet exists for $resource" }
        assets.store(fileProcessor.process(resource))
    }
}

View File

@ -1,7 +1,7 @@
package de.itkl.assetmanager.implementation package de.itkl.assetmanager.implementation
import de.itkl.assetmanager.interfaces.AssetManager import de.itkl.assetmanager.interfaces.AssetManager
import de.itkl.assetmanager.interfaces.Assets import de.itkl.core_api.interfaces.assets.Assets
import de.itkl.core_api.interfaces.Resource import de.itkl.core_api.interfaces.Resource
import de.itkl.core_api.interfaces.ResourceFactory import de.itkl.core_api.interfaces.ResourceFactory
import io.github.oshai.kotlinlogging.KotlinLogging import io.github.oshai.kotlinlogging.KotlinLogging

View File

@ -1,7 +1,7 @@
package de.itkl.assetmanager.implementation package de.itkl.assetmanager.implementation
import de.itkl.assetmanager.interfaces.AssetManager import de.itkl.assetmanager.interfaces.AssetManager
import de.itkl.assetmanager.interfaces.Assets import de.itkl.core_api.interfaces.assets.Assets
import de.itkl.assetmanager.interfaces.Project import de.itkl.assetmanager.interfaces.Project
import de.itkl.assetmanager.interfaces.ProjectManager import de.itkl.assetmanager.interfaces.ProjectManager
import de.itkl.core_api.interfaces.Resource import de.itkl.core_api.interfaces.Resource

View File

@ -1,5 +1,7 @@
package de.itkl.assetmanager.interfaces package de.itkl.assetmanager.interfaces
import de.itkl.core_api.interfaces.assets.Assets
/** /**
* Manage the assets for one document * Manage the assets for one document
*/ */

View File

@ -1,10 +0,0 @@
package de.itkl.assetmanager.interfaces
import de.itkl.core_api.interfaces.Resource
import kotlinx.coroutines.flow.Flow
/**
 * A set of [Resource]s that can be stored, looked up and deleted by name, and that can
 * also be collected as a [Flow] of [Resource].
 *
 * NOTE(review): presumably one instance holds the assets of a single document — confirm
 * against the implementations.
 */
interface Assets : Flow<Resource> {
/** Stores [resource]. NOTE(review): presumably keyed by the resource's filename — confirm. */
suspend fun store(resource: Resource)
/**
 * Looks up a resource by [name].
 *
 * The result is nullable; NOTE(review): null presumably means no asset with that name
 * exists — confirm against the implementations.
 */
suspend fun retrieve(name: String): Resource?
/** Deletes the resource called [name]. NOTE(review): behaviour for an unknown name is not visible here. */
suspend fun delete(name: String)
}

View File

@ -1,6 +1,7 @@
package de.itkl.assetmanager.interfaces package de.itkl.assetmanager.interfaces
import de.itkl.core_api.interfaces.Resource import de.itkl.core_api.interfaces.Resource
import de.itkl.core_api.interfaces.assets.Assets
/** /**
* A set of documents. Each can hold its own assets * A set of documents. Each can hold its own assets

View File

@ -0,0 +1,34 @@
package de.itkl.core_api.implementation
import de.itkl.core_api.interfaces.Resource
import io.ktor.http.*
import kotlinx.serialization.*
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.encodeToStream
import java.io.File
import java.io.InputStream
import java.io.UnsupportedEncodingException
import java.nio.file.Path
/**
 * In-memory [Resource] whose content is produced by serializing [obj] with [serializer]
 * every time [read] is called.
 *
 * Only JSON content types are supported. The check uses [ContentType.match], so a JSON
 * content type that carries parameters (for example `application/json; charset=utf-8`)
 * is accepted as well. [length], [file] and [path] are always `null`.
 *
 * @param filename name of the resource
 * @param contentType content type that selects the serialization format
 * @param obj the object to serialize
 * @param serializer serialization strategy used for [obj]
 */
class SerializableResource<T : Any> @OptIn(ExperimentalSerializationApi::class) constructor(
    override val filename: String,
    override val contentType: ContentType,
    private val obj: T,
    private val serializer: SerializationStrategy<T>
) : Resource {
    override val length: Long? = null
    override val file: File? = null
    override val path: Path? = null

    /**
     * Serializes [obj] and returns the text as a byte stream.
     *
     * The bytes are encoded with the charset declared in [contentType], falling back to
     * UTF-8 when none is declared.
     *
     * @throws UnsupportedEncodingException if [contentType] is not a JSON content type
     */
    override fun read(): InputStream {
        val charset = contentType.charset() ?: Charsets.UTF_8
        return serialize().byteInputStream(charset)
    }

    /** Returns [filename], matching what `AbstractResource.toString` returns, so log messages stay readable. */
    override fun toString(): String = filename

    /** Renders [obj] as text in the format selected by [contentType]. */
    private fun serialize(): String = when {
        // match() ignores extra parameters such as charset; plain equality would reject them.
        contentType.match(ContentType.Application.Json) -> Json.encodeToString(serializer, obj)
        else -> throw UnsupportedEncodingException("Sorry but $contentType is not supported for Resources")
    }
}

View File

@ -2,8 +2,14 @@ package de.itkl.core_api.interfaces
import java.io.File import java.io.File
import java.nio.file.Path import java.nio.file.Path
import java.util.function.Consumer
interface FileProcessor { interface FileProcessor {
fun willProduce(path: Path): Path fun willProduce(path: Path): Path
suspend fun process(resource: Resource): File suspend fun process(resource: Resource): File
}
interface FileProcessor2 {
/**
 * Name of the output this processor produces.
 *
 * `AssetsFileProcessorBackend` checks the assets for an entry with this name to decide
 * whether [process] still has to run.
 */
val filename: String
/**
 * Processes [resource] and returns the result as a new [Resource].
 *
 * NOTE(review): the returned resource is presumably expected to be named [filename] —
 * confirm against the implementations.
 */
suspend fun process(resource: Resource): Resource
} }

View File

@ -23,6 +23,7 @@ interface Resource {
/**
 * Reads the whole resource and decodes it as JSON using [deserializer].
 *
 * The content is decoded as UTF-8 text, and the stream returned by [read] is closed
 * after it has been consumed.
 *
 * @param deserializer strategy used to decode the JSON text
 * @return the decoded object
 */
fun <T: Any> json(deserializer: DeserializationStrategy<T>): T {
    // decodeToString() yields the actual text. contentToString() would yield a list-style
    // dump of the byte values ("[123, 34, ...]"), which is not the stored document.
    val text = read().use { it.readAllBytes() }.decodeToString()
    return Json.decodeFromString(deserializer, text)
}
} }
@ -36,4 +37,8 @@ abstract class AbstractResource : Resource, KoinComponent {
/** Returns the stream produced by `doRead()`; declared final, so subclasses cannot override it. */
final override fun read(): InputStream = doRead()
/** Uses the resource's filename as its string representation. */
override fun toString(): String = filename
} }

View File

@ -2,6 +2,9 @@ package de.itkl.core_api.interfaces
import de.itkl.core_api.implementation.FileResource import de.itkl.core_api.implementation.FileResource
import de.itkl.core_api.implementation.ProgressResource import de.itkl.core_api.implementation.ProgressResource
import de.itkl.core_api.implementation.SerializableResource
import io.ktor.http.*
import kotlinx.serialization.SerializationStrategy
import org.koin.core.component.KoinComponent import org.koin.core.component.KoinComponent
import org.koin.core.component.inject import org.koin.core.component.inject
import java.io.File import java.io.File
@ -11,7 +14,13 @@ import java.nio.file.Paths
class ResourceFactory : KoinComponent { class ResourceFactory : KoinComponent {
private val progressBarFactory by inject<ProgressBarFactory>() private val progressBarFactory by inject<ProgressBarFactory>()
/**
 * Creates a [SerializableResource] named [name] with content type `application/json`,
 * which serializes [obj] using [serializationStrategy].
 */
fun <T : Any> json(name: String, obj: T, serializationStrategy: SerializationStrategy<T>): Resource =
    SerializableResource(
        filename = name,
        contentType = ContentType.Application.Json,
        obj = obj,
        serializer = serializationStrategy
    )
/** Converts [path] with [Paths.get] and delegates to the `file` overload that accepts the resulting path. */
fun file(path: String): Resource = file(Paths.get(path))

View File

@ -1,7 +1,8 @@
package de.itkl.core_api.interfaces.data package de.itkl.core_api.interfaces.data
import de.itkl.core_api.interfaces.FileProcessor import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.FileProcessor2
interface Processable { interface Processable {
suspend fun process(fileProcessor: FileProcessor) suspend fun process(fileProcessor: FileProcessor2)
} }

View File

@ -2,12 +2,15 @@ package de.itkl.httpClient.clients
import de.itkl.core_api.dtos.MsOcrResponse import de.itkl.core_api.dtos.MsOcrResponse
import de.itkl.core_api.interfaces.FileProcessor import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.FileProcessor2
import de.itkl.core_api.interfaces.Resource import de.itkl.core_api.interfaces.Resource
import de.itkl.core_api.interfaces.ResourceFactory
import io.github.oshai.kotlinlogging.KotlinLogging import io.github.oshai.kotlinlogging.KotlinLogging
import io.ktor.client.* import io.ktor.client.*
import io.ktor.client.call.* import io.ktor.client.call.*
import io.ktor.client.request.* import io.ktor.client.request.*
import io.ktor.client.statement.* import io.ktor.client.statement.*
import io.ktor.client.utils.EmptyContent.contentType
import io.ktor.http.* import io.ktor.http.*
import kotlinx.serialization.json.Json import kotlinx.serialization.json.Json
import org.koin.core.component.KoinComponent import org.koin.core.component.KoinComponent
@ -18,8 +21,9 @@ import kotlin.io.path.nameWithoutExtension
import kotlin.io.path.writeText import kotlin.io.path.writeText
private val Log = KotlinLogging.logger { } private val Log = KotlinLogging.logger { }
class MsOcr: KoinComponent, FileProcessor { class MsOcr: KoinComponent, FileProcessor2 {
private val httpClient: HttpClient by inject() private val httpClient: HttpClient by inject()
private val resourceFactory: ResourceFactory by inject()
suspend fun ocr(resource: Resource): MsOcrResponse { suspend fun ocr(resource: Resource): MsOcrResponse {
val response = httpClient.post { val response = httpClient.post {
@ -34,15 +38,10 @@ class MsOcr: KoinComponent, FileProcessor {
return response.body() return response.body()
} }
override fun willProduce(path: Path): Path { override val filename = "ms-ocr.json"
return path.parent.resolve(path.nameWithoutExtension + ".ms-ocr.json")
}
override suspend fun process(resource: Resource): File { override suspend fun process(resource: Resource): Resource {
val result = ocr(resource) val result = ocr(resource)
val jsonString = Json.encodeToString(MsOcrResponse.serializer(), result) return resourceFactory.json(filename, result, MsOcrResponse.serializer())
val destination = willProduce(resource.path!!)
destination.writeText(jsonString)
return destination.toFile()
} }
} }

View File

@ -22,15 +22,12 @@ class CorpusFactory : KoinComponent {
} }
} }
} }
class Corpus(private val project: Project): Processable, KoinComponent { class Corpus(private val project: Project): KoinComponent {
val displayName get() = project.displayName val displayName get() = project.displayName
val documentNames get() = project.documentNames val documentNames get() = project.documentNames
private val resourceFactory: ResourceFactory by inject() private val resourceFactory: ResourceFactory by inject()
override suspend fun process(fileProcessor: FileProcessor) {
TODO("NEXT")
}
/**
 * Builds the [Document] called [name] from the project's resource of the same name.
 *
 * Throws a [NullPointerException] when the project returns no resource for [name].
 */
suspend fun document(name: String): Document {
    val resource = project.resource(name)!!
    return Document(name, listOf(resource))
}

View File

@ -3,7 +3,10 @@ package de.itkl.textprocessing
import de.itkl.assetmanager.interfaces.AssetManager import de.itkl.assetmanager.interfaces.AssetManager
import de.itkl.core_api.dtos.MsOcrResponse import de.itkl.core_api.dtos.MsOcrResponse
import de.itkl.core_api.interfaces.FileProcessor import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.FileProcessor2
import de.itkl.core_api.interfaces.Resource import de.itkl.core_api.interfaces.Resource
import de.itkl.core_api.interfaces.assets.Assets
import de.itkl.core_api.interfaces.assets.FileProcessorBackend
import de.itkl.core_api.interfaces.data.Processable import de.itkl.core_api.interfaces.data.Processable
import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.asFlow import kotlinx.coroutines.flow.asFlow
@ -18,6 +21,10 @@ class Document(
val resources: List<Resource> val resources: List<Resource>
) : Processable, KoinComponent { ) : Processable, KoinComponent {
private val assetManager: AssetManager by inject() private val assetManager: AssetManager by inject()
private val fileProcessorBackend: FileProcessorBackend by inject()
/** Returns the [Assets] that the asset manager provides for this document's name. */
suspend fun assets(): Assets = assetManager.assets(name)
/** /**
* Loads the extracted ocr pages. Note that not every pages * Loads the extracted ocr pages. Note that not every pages
@ -25,7 +32,7 @@ class Document(
*/ */
suspend fun retrieveOcrPages(): List<OcrPage> { suspend fun retrieveOcrPages(): List<OcrPage> {
// TODO: How to identify the assets independently from their name? // TODO: How to identify the assets independently from their name?
val resource = checkNotNull(assetManager.assets(name) val resource = checkNotNull(assets()
.retrieve("ms-ocr")) { .retrieve("ms-ocr")) {
"Ocr for $name is not yet created" "Ocr for $name is not yet created"
} }
@ -33,9 +40,12 @@ class Document(
return msOcrResponse.analyzeResult.readResults.map { toOcrPage(it) } return msOcrResponse.analyzeResult.readResults.map { toOcrPage(it) }
} }
override suspend fun process(fileProcessor: FileProcessor) { override suspend fun process(fileProcessor: FileProcessor2) {
// TODO: Rework the whole fileprocessor. Should work indepently from a pipeline fileProcessorBackend.process(
fileProcessor.process(resources.first()) resources.first(),
assets(),
fileProcessor
)
} }
private fun toOcrPage(readResult: MsOcrResponse.AnalyzeResult.ReadResult): OcrPage { private fun toOcrPage(readResult: MsOcrResponse.AnalyzeResult.ReadResult): OcrPage {