From 2cab145008b2ddc5992d56de489713d67883429d Mon Sep 17 00:00:00 2001 From: Timo Bryant Date: Thu, 4 Jan 2024 11:56:20 +0100 Subject: [PATCH] starting with fileprocessor2 --- .../itkl/assetmanager/assetManagerModule.kt | 3 ++ .../AssetsFileProcessorBackend.kt | 22 ++++++++++++ .../implementation/FilesystemAssetManager.kt | 2 +- .../FilesystemProjectManager.kt | 2 +- .../assetmanager/interfaces/AssetManager.kt | 2 ++ .../de/itkl/assetmanager/interfaces/Assets.kt | 10 ------ .../itkl/assetmanager/interfaces/Project.kt | 1 + .../implementation/SerializableResource.kt | 34 +++++++++++++++++++ .../itkl/core_api/interfaces/FileProcessor.kt | 6 ++++ .../de/itkl/core_api/interfaces/Resource.kt | 5 +++ .../core_api/interfaces/ResourceFactory.kt | 11 +++++- .../core_api/interfaces/data/Processable.kt | 3 +- .../de/itkl/httpClient/clients/MsOcr.kt | 17 +++++----- .../kotlin/de/itkl/textprocessing/Corpus.kt | 5 +-- .../{DocumentContainer.kt => Document.kt} | 18 +++++++--- 15 files changed, 110 insertions(+), 31 deletions(-) create mode 100644 libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/implementation/AssetsFileProcessorBackend.kt delete mode 100644 libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/interfaces/Assets.kt create mode 100644 libraries/core-api/src/main/kotlin/de/itkl/core_api/implementation/SerializableResource.kt rename libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/{DocumentContainer.kt => Document.kt} (82%) diff --git a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/assetManagerModule.kt b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/assetManagerModule.kt index 5d31d49..8b29bdd 100644 --- a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/assetManagerModule.kt +++ b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/assetManagerModule.kt @@ -1,12 +1,15 @@ package de.itkl.assetmanager +import de.itkl.assetmanager.implementation.AssetsFileProcessorBackend import de.itkl.assetmanager.implementation.FilesystemAssetManager import de.itkl.assetmanager.implementation.FilesystemProjectManager import de.itkl.assetmanager.interfaces.AssetManager import de.itkl.assetmanager.interfaces.ProjectManager +import de.itkl.core_api.interfaces.assets.FileProcessorBackend import org.koin.dsl.module val assetManagerModule = module { single { FilesystemProjectManager() } single { FilesystemAssetManager() } + single { AssetsFileProcessorBackend() } } \ No newline at end of file diff --git a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/implementation/AssetsFileProcessorBackend.kt b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/implementation/AssetsFileProcessorBackend.kt new file mode 100644 index 0000000..567cccd --- /dev/null +++ b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/implementation/AssetsFileProcessorBackend.kt @@ -0,0 +1,22 @@ +package de.itkl.assetmanager.implementation + +import de.itkl.core_api.interfaces.FileProcessor2 +import de.itkl.core_api.interfaces.Resource +import de.itkl.core_api.interfaces.assets.Assets +import de.itkl.core_api.interfaces.assets.FileProcessorBackend +import io.github.oshai.kotlinlogging.KotlinLogging +import org.koin.core.component.KoinComponent + +private val Log = KotlinLogging.logger { } +class AssetsFileProcessorBackend : FileProcessorBackend, KoinComponent { + override suspend fun process(resource: Resource, assets: Assets, fileProcessor: FileProcessor2) { + Log.debug { "Call processor '${fileProcessor.filename}' on $resource" } + if (assets.exists(fileProcessor.filename)) { + Log.info { "${fileProcessor.filename} already exists on ${resource}. Skipping" } + } else { + Log.info { "${fileProcessor.filename} does not yet exists for $resource" } + val newResource = fileProcessor.process(resource) + assets.store(newResource) + } + } +} \ No newline at end of file diff --git a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/implementation/FilesystemAssetManager.kt b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/implementation/FilesystemAssetManager.kt index 5185097..aba83b3 100644 --- a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/implementation/FilesystemAssetManager.kt +++ b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/implementation/FilesystemAssetManager.kt @@ -1,7 +1,7 @@ package de.itkl.assetmanager.implementation import de.itkl.assetmanager.interfaces.AssetManager -import de.itkl.assetmanager.interfaces.Assets +import de.itkl.core_api.interfaces.assets.Assets import de.itkl.core_api.interfaces.Resource import de.itkl.core_api.interfaces.ResourceFactory import io.github.oshai.kotlinlogging.KotlinLogging diff --git a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/implementation/FilesystemProjectManager.kt b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/implementation/FilesystemProjectManager.kt index 671cbff..8d3827b 100644 --- a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/implementation/FilesystemProjectManager.kt +++ b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/implementation/FilesystemProjectManager.kt @@ -1,7 +1,7 @@ package de.itkl.assetmanager.implementation import de.itkl.assetmanager.interfaces.AssetManager -import de.itkl.assetmanager.interfaces.Assets +import de.itkl.core_api.interfaces.assets.Assets import de.itkl.assetmanager.interfaces.Project import de.itkl.assetmanager.interfaces.ProjectManager import de.itkl.core_api.interfaces.Resource diff --git a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/interfaces/AssetManager.kt b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/interfaces/AssetManager.kt index 5488ae2..d3b7a32 100644 --- a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/interfaces/AssetManager.kt +++ b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/interfaces/AssetManager.kt @@ -1,5 +1,7 @@ package de.itkl.assetmanager.interfaces +import de.itkl.core_api.interfaces.assets.Assets + /** * Manage the assets for one document */ diff --git a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/interfaces/Assets.kt b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/interfaces/Assets.kt deleted file mode 100644 index 38653e3..0000000 --- a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/interfaces/Assets.kt +++ /dev/null @@ -1,10 +0,0 @@ -package de.itkl.assetmanager.interfaces - -import de.itkl.core_api.interfaces.Resource -import kotlinx.coroutines.flow.Flow - -interface Assets : Flow { - suspend fun store(resource: Resource) - suspend fun retrieve(name: String): Resource? - suspend fun delete(name: String) -} \ No newline at end of file diff --git a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/interfaces/Project.kt b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/interfaces/Project.kt index 20cbf6f..ca78725 100644 --- a/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/interfaces/Project.kt +++ b/libraries/assetmanager/src/main/kotlin/de/itkl/assetmanager/interfaces/Project.kt @@ -1,6 +1,7 @@ package de.itkl.assetmanager.interfaces import de.itkl.core_api.interfaces.Resource +import de.itkl.core_api.interfaces.assets.Assets /** * A set of documents. Each can hold its own assets diff --git a/libraries/core-api/src/main/kotlin/de/itkl/core_api/implementation/SerializableResource.kt b/libraries/core-api/src/main/kotlin/de/itkl/core_api/implementation/SerializableResource.kt new file mode 100644 index 0000000..bd63594 --- /dev/null +++ b/libraries/core-api/src/main/kotlin/de/itkl/core_api/implementation/SerializableResource.kt @@ -0,0 +1,34 @@ +package de.itkl.core_api.implementation + +import de.itkl.core_api.interfaces.Resource +import io.ktor.http.* +import kotlinx.serialization.* +import kotlinx.serialization.json.Json +import kotlinx.serialization.json.encodeToStream +import java.io.File +import java.io.InputStream +import java.io.UnsupportedEncodingException +import java.nio.file.Path + +class SerializableResource @OptIn(ExperimentalSerializationApi::class) constructor( + override val filename: String, + override val contentType: ContentType, + private val obj: T, + private val serializer: SerializationStrategy +) : Resource { + + override val length: Long? = null + override val file: File? = null + override val path: Path? = null + + override fun read(): InputStream { + return serialize().byteInputStream() + } + + private fun serialize(): String { + return when(contentType) { + ContentType.Application.Json -> Json.encodeToString(serializer, obj) + else -> throw UnsupportedEncodingException("Sorry but $contentType is not supported for Resources") + } + } +} \ No newline at end of file diff --git a/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/FileProcessor.kt b/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/FileProcessor.kt index 14c588a..9e169ec 100644 --- a/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/FileProcessor.kt +++ b/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/FileProcessor.kt @@ -2,8 +2,14 @@ package de.itkl.core_api.interfaces import java.io.File import java.nio.file.Path +import java.util.function.Consumer interface FileProcessor { fun willProduce(path: Path): Path suspend fun process(resource: Resource): File +} + +interface FileProcessor2 { + val filename: String + suspend fun process(resource: Resource): Resource } \ No newline at end of file diff --git a/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/Resource.kt b/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/Resource.kt index 52c51f0..7ea0d03 100644 --- a/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/Resource.kt +++ b/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/Resource.kt @@ -23,6 +23,7 @@ interface Resource { fun json(deserializer: DeserializationStrategy): T { return Json.decodeFromString(deserializer, read().readAllBytes().contentToString()) } + } @@ -36,4 +37,8 @@ abstract class AbstractResource : Resource, KoinComponent { final override fun read(): InputStream { return doRead() } + + override fun toString(): String { + return filename + } } \ No newline at end of file diff --git a/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/ResourceFactory.kt b/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/ResourceFactory.kt index 7853384..750baf9 100644 --- a/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/ResourceFactory.kt +++ b/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/ResourceFactory.kt @@ -2,6 +2,9 @@ package de.itkl.core_api.interfaces import de.itkl.core_api.implementation.FileResource import de.itkl.core_api.implementation.ProgressResource +import de.itkl.core_api.implementation.SerializableResource +import io.ktor.http.* +import kotlinx.serialization.SerializationStrategy import org.koin.core.component.KoinComponent import org.koin.core.component.inject import java.io.File @@ -11,7 +14,13 @@ import java.nio.file.Paths class ResourceFactory : KoinComponent { private val progressBarFactory by inject() - + fun json(name: String, obj: T, serializationStrategy: SerializationStrategy): Resource { + return SerializableResource( + filename = name, + contentType = ContentType.Application.Json, + obj = obj, + serializer = serializationStrategy) + } fun file(path: String): Resource { return file(Paths.get(path)) } diff --git a/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/data/Processable.kt b/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/data/Processable.kt index 7db75c3..d97e84d 100644 --- a/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/data/Processable.kt +++ b/libraries/core-api/src/main/kotlin/de/itkl/core_api/interfaces/data/Processable.kt @@ -1,7 +1,8 @@ package de.itkl.core_api.interfaces.data import de.itkl.core_api.interfaces.FileProcessor +import de.itkl.core_api.interfaces.FileProcessor2 interface Processable { - suspend fun process(fileProcessor: FileProcessor) + suspend fun process(fileProcessor: FileProcessor2) } \ No newline at end of file diff --git a/libraries/httpClient/src/main/kotlin/de/itkl/httpClient/clients/MsOcr.kt b/libraries/httpClient/src/main/kotlin/de/itkl/httpClient/clients/MsOcr.kt index a27e3d5..4ab0dff 100644 --- a/libraries/httpClient/src/main/kotlin/de/itkl/httpClient/clients/MsOcr.kt +++ b/libraries/httpClient/src/main/kotlin/de/itkl/httpClient/clients/MsOcr.kt @@ -2,12 +2,15 @@ package de.itkl.httpClient.clients import de.itkl.core_api.dtos.MsOcrResponse import de.itkl.core_api.interfaces.FileProcessor +import de.itkl.core_api.interfaces.FileProcessor2 import de.itkl.core_api.interfaces.Resource +import de.itkl.core_api.interfaces.ResourceFactory import io.github.oshai.kotlinlogging.KotlinLogging import io.ktor.client.* import io.ktor.client.call.* import io.ktor.client.request.* import io.ktor.client.statement.* +import io.ktor.client.utils.EmptyContent.contentType import io.ktor.http.* import kotlinx.serialization.json.Json import org.koin.core.component.KoinComponent @@ -18,8 +21,9 @@ import kotlin.io.path.nameWithoutExtension import kotlin.io.path.writeText private val Log = KotlinLogging.logger { } -class MsOcr: KoinComponent, FileProcessor { +class MsOcr: KoinComponent, FileProcessor2 { private val httpClient: HttpClient by inject() + private val resourceFactory: ResourceFactory by inject() suspend fun ocr(resource: Resource): MsOcrResponse { val response = httpClient.post { @@ -34,15 +38,10 @@ class MsOcr: KoinComponent, FileProcessor { return response.body() } - override fun willProduce(path: Path): Path { - return path.parent.resolve(path.nameWithoutExtension + ".ms-ocr.json") - } + override val filename = "ms-ocr.json" - override suspend fun process(resource: Resource): File { + override suspend fun process(resource: Resource): Resource { val result = ocr(resource) - val jsonString = Json.encodeToString(MsOcrResponse.serializer(), result) - val destination = willProduce(resource.path!!) - destination.writeText(jsonString) - return destination.toFile() + return resourceFactory.json(filename, result, MsOcrResponse.serializer()) } } \ No newline at end of file diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Corpus.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Corpus.kt index 834c478..d9f4502 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Corpus.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Corpus.kt @@ -22,15 +22,12 @@ class CorpusFactory : KoinComponent { } } } -class Corpus(private val project: Project): Processable, KoinComponent { +class Corpus(private val project: Project): KoinComponent { val displayName get() = project.displayName val documentNames get() = project.documentNames private val resourceFactory: ResourceFactory by inject() - override suspend fun process(fileProcessor: FileProcessor) { - TODO("NEXT") - } suspend fun document(name: String): Document { return Document(name, listOf(project.resource(name)!!)) } diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/DocumentContainer.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Document.kt similarity index 82% rename from libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/DocumentContainer.kt rename to libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Document.kt index e0f6bfc..4f3fb37 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/DocumentContainer.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Document.kt @@ -3,7 +3,10 @@ package de.itkl.textprocessing import de.itkl.assetmanager.interfaces.AssetManager import de.itkl.core_api.dtos.MsOcrResponse import de.itkl.core_api.interfaces.FileProcessor +import de.itkl.core_api.interfaces.FileProcessor2 import de.itkl.core_api.interfaces.Resource +import de.itkl.core_api.interfaces.assets.Assets +import de.itkl.core_api.interfaces.assets.FileProcessorBackend import de.itkl.core_api.interfaces.data.Processable import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.asFlow @@ -18,6 +21,10 @@ class Document( val resources: List ) : Processable, KoinComponent { private val assetManager: AssetManager by inject() + private val fileProcessorBackend: FileProcessorBackend by inject() + suspend fun assets(): Assets { + return assetManager.assets(name) + } /** * Loads the extracted ocr pages. Note that not every pages @@ -25,7 +32,7 @@ class Document( */ suspend fun retrieveOcrPages(): List { // TODO: How to identify the assets independently from their name? - val resource = checkNotNull(assetManager.assets(name) + val resource = checkNotNull(assets() .retrieve("ms-ocr")) { "Ocr for $name is not yet created" } @@ -33,9 +40,12 @@ class Document( return msOcrResponse.analyzeResult.readResults.map { toOcrPage(it) } } - override suspend fun process(fileProcessor: FileProcessor) { - // TODO: Rework the whole fileprocessor. Should work indepently from a pipeline - fileProcessor.process(resources.first()) + override suspend fun process(fileProcessor: FileProcessor2) { + fileProcessorBackend.process( + resources.first(), + assets(), + fileProcessor + ) } private fun toOcrPage(readResult: MsOcrResponse.AnalyzeResult.ReadResult): OcrPage {