starting with fileprocessor2
parent
9ea725fc36
commit
2cab145008
|
|
@ -1,12 +1,15 @@
|
||||||
package de.itkl.assetmanager
|
package de.itkl.assetmanager
|
||||||
|
|
||||||
|
import de.itkl.assetmanager.implementation.AssetsFileProcessorBackend
|
||||||
import de.itkl.assetmanager.implementation.FilesystemAssetManager
|
import de.itkl.assetmanager.implementation.FilesystemAssetManager
|
||||||
import de.itkl.assetmanager.implementation.FilesystemProjectManager
|
import de.itkl.assetmanager.implementation.FilesystemProjectManager
|
||||||
import de.itkl.assetmanager.interfaces.AssetManager
|
import de.itkl.assetmanager.interfaces.AssetManager
|
||||||
import de.itkl.assetmanager.interfaces.ProjectManager
|
import de.itkl.assetmanager.interfaces.ProjectManager
|
||||||
|
import de.itkl.core_api.interfaces.assets.FileProcessorBackend
|
||||||
import org.koin.dsl.module
|
import org.koin.dsl.module
|
||||||
|
|
||||||
val assetManagerModule = module {
|
val assetManagerModule = module {
|
||||||
single<ProjectManager> { FilesystemProjectManager() }
|
single<ProjectManager> { FilesystemProjectManager() }
|
||||||
single<AssetManager> { FilesystemAssetManager() }
|
single<AssetManager> { FilesystemAssetManager() }
|
||||||
|
single<FileProcessorBackend> { AssetsFileProcessorBackend() }
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1,22 @@
|
||||||
|
package de.itkl.assetmanager.implementation
|
||||||
|
|
||||||
|
import de.itkl.core_api.interfaces.FileProcessor2
|
||||||
|
import de.itkl.core_api.interfaces.Resource
|
||||||
|
import de.itkl.core_api.interfaces.assets.Assets
|
||||||
|
import de.itkl.core_api.interfaces.assets.FileProcessorBackend
|
||||||
|
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||||
|
import org.koin.core.component.KoinComponent
|
||||||
|
|
||||||
|
private val Log = KotlinLogging.logger { }
|
||||||
|
/**
 * [FileProcessorBackend] that runs a processor only when the given [Assets] do not
 * already report an entry under the processor's filename.
 */
class AssetsFileProcessorBackend : FileProcessorBackend, KoinComponent {

    /**
     * Applies [fileProcessor] to [resource] and stores the produced resource in [assets].
     *
     * Nothing is processed or stored when [assets] already reports an entry named
     * like the processor's filename.
     */
    override suspend fun process(resource: Resource, assets: Assets, fileProcessor: FileProcessor2) {
        Log.debug { "Call processor '${fileProcessor.filename}' on $resource" }

        // Guard clause: an existing entry means there is nothing left to do.
        if (assets.exists(fileProcessor.filename)) {
            Log.info { "${fileProcessor.filename} already exists on ${resource}. Skipping" }
            return
        }

        Log.info { "${fileProcessor.filename} does not yet exists for $resource" }
        assets.store(fileProcessor.process(resource))
    }
}
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
package de.itkl.assetmanager.implementation
|
package de.itkl.assetmanager.implementation
|
||||||
|
|
||||||
import de.itkl.assetmanager.interfaces.AssetManager
|
import de.itkl.assetmanager.interfaces.AssetManager
|
||||||
import de.itkl.assetmanager.interfaces.Assets
|
import de.itkl.core_api.interfaces.assets.Assets
|
||||||
import de.itkl.core_api.interfaces.Resource
|
import de.itkl.core_api.interfaces.Resource
|
||||||
import de.itkl.core_api.interfaces.ResourceFactory
|
import de.itkl.core_api.interfaces.ResourceFactory
|
||||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
package de.itkl.assetmanager.implementation
|
package de.itkl.assetmanager.implementation
|
||||||
|
|
||||||
import de.itkl.assetmanager.interfaces.AssetManager
|
import de.itkl.assetmanager.interfaces.AssetManager
|
||||||
import de.itkl.assetmanager.interfaces.Assets
|
import de.itkl.core_api.interfaces.assets.Assets
|
||||||
import de.itkl.assetmanager.interfaces.Project
|
import de.itkl.assetmanager.interfaces.Project
|
||||||
import de.itkl.assetmanager.interfaces.ProjectManager
|
import de.itkl.assetmanager.interfaces.ProjectManager
|
||||||
import de.itkl.core_api.interfaces.Resource
|
import de.itkl.core_api.interfaces.Resource
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
package de.itkl.assetmanager.interfaces
|
package de.itkl.assetmanager.interfaces
|
||||||
|
|
||||||
|
import de.itkl.core_api.interfaces.assets.Assets
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Manage the assets for one document
|
* Manage the assets for one document
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -1,10 +0,0 @@
|
||||||
package de.itkl.assetmanager.interfaces
|
|
||||||
|
|
||||||
import de.itkl.core_api.interfaces.Resource
|
|
||||||
import kotlinx.coroutines.flow.Flow
|
|
||||||
|
|
||||||
interface Assets : Flow<Resource> {
|
|
||||||
suspend fun store(resource: Resource)
|
|
||||||
suspend fun retrieve(name: String): Resource?
|
|
||||||
suspend fun delete(name: String)
|
|
||||||
}
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
package de.itkl.assetmanager.interfaces
|
package de.itkl.assetmanager.interfaces
|
||||||
|
|
||||||
import de.itkl.core_api.interfaces.Resource
|
import de.itkl.core_api.interfaces.Resource
|
||||||
|
import de.itkl.core_api.interfaces.assets.Assets
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A set of documents. Each can hold its own assets
|
* A set of documents. Each can hold its own assets
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
package de.itkl.core_api.implementation
|
||||||
|
|
||||||
|
import de.itkl.core_api.interfaces.Resource
|
||||||
|
import io.ktor.http.*
|
||||||
|
import kotlinx.serialization.*
|
||||||
|
import kotlinx.serialization.json.Json
|
||||||
|
import kotlinx.serialization.json.encodeToStream
|
||||||
|
import java.io.File
|
||||||
|
import java.io.InputStream
|
||||||
|
import java.io.UnsupportedEncodingException
|
||||||
|
import java.nio.file.Path
|
||||||
|
|
||||||
|
/**
 * A [Resource] backed by an in-memory object that is serialized every time it is read.
 *
 * Only JSON content types are supported; reading with any other [contentType] throws
 * [UnsupportedEncodingException].
 *
 * @param filename name reported by this resource
 * @param contentType content type reported by this resource; selects the encoding used by [read]
 * @param obj the object that gets serialized
 * @param serializer strategy used to encode [obj]
 */
class SerializableResource<T : Any>(
    override val filename: String,
    override val contentType: ContentType,
    private val obj: T,
    private val serializer: SerializationStrategy<T>,
) : Resource {

    // The content is produced on demand from [obj], so these are left unset.
    override val length: Long? = null
    override val file: File? = null
    override val path: Path? = null

    /**
     * Serializes the wrapped object and returns the resulting text as a stream.
     *
     * @throws UnsupportedEncodingException if [contentType] is not a JSON content type
     */
    override fun read(): InputStream = serialize().byteInputStream()

    /** Encodes the wrapped object according to [contentType]. */
    private fun serialize(): String = when {
        // match() ignores extra parameters on this content type, so
        // "application/json; charset=UTF-8" is accepted as well as plain "application/json".
        contentType.match(ContentType.Application.Json) -> Json.encodeToString(serializer, obj)
        else -> throw UnsupportedEncodingException("Sorry but $contentType is not supported for Resources")
    }
}
|
||||||
|
|
@ -2,8 +2,14 @@ package de.itkl.core_api.interfaces
|
||||||
|
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
|
import java.util.function.Consumer
|
||||||
|
|
||||||
interface FileProcessor {
|
interface FileProcessor {
|
||||||
fun willProduce(path: Path): Path
|
fun willProduce(path: Path): Path
|
||||||
suspend fun process(resource: Resource): File
|
suspend fun process(resource: Resource): File
|
||||||
|
}
|
||||||
|
|
||||||
|
interface FileProcessor2 {
|
||||||
|
val filename: String
|
||||||
|
suspend fun process(resource: Resource): Resource
|
||||||
}
|
}
|
||||||
|
|
@ -23,6 +23,7 @@ interface Resource {
|
||||||
/**
 * Reads this resource completely and decodes its content as JSON using [deserializer].
 *
 * The bytes are interpreted as UTF-8 text. The stream returned by [read] is closed
 * once its content has been read.
 *
 * @param deserializer strategy used to decode the JSON text into [T]
 * @return the decoded object
 */
fun <T: Any> json(deserializer: DeserializationStrategy<T>): T {
    // ByteArray.contentToString() renders the array as "[123, 34, ...]" (a debug
    // representation), which is never valid input here; decodeToString() yields the text.
    val text = read().use { stream -> stream.readBytes().decodeToString() }
    return Json.decodeFromString(deserializer, text)
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -36,4 +37,8 @@ abstract class AbstractResource : Resource, KoinComponent {
|
||||||
final override fun read(): InputStream {
|
final override fun read(): InputStream {
|
||||||
return doRead()
|
return doRead()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** A resource is rendered as its [filename]. */
override fun toString(): String = filename
|
||||||
}
|
}
|
||||||
|
|
@ -2,6 +2,9 @@ package de.itkl.core_api.interfaces
|
||||||
|
|
||||||
import de.itkl.core_api.implementation.FileResource
|
import de.itkl.core_api.implementation.FileResource
|
||||||
import de.itkl.core_api.implementation.ProgressResource
|
import de.itkl.core_api.implementation.ProgressResource
|
||||||
|
import de.itkl.core_api.implementation.SerializableResource
|
||||||
|
import io.ktor.http.*
|
||||||
|
import kotlinx.serialization.SerializationStrategy
|
||||||
import org.koin.core.component.KoinComponent
|
import org.koin.core.component.KoinComponent
|
||||||
import org.koin.core.component.inject
|
import org.koin.core.component.inject
|
||||||
import java.io.File
|
import java.io.File
|
||||||
|
|
@ -11,7 +14,13 @@ import java.nio.file.Paths
|
||||||
class ResourceFactory : KoinComponent {
|
class ResourceFactory : KoinComponent {
|
||||||
|
|
||||||
private val progressBarFactory by inject<ProgressBarFactory>()
|
private val progressBarFactory by inject<ProgressBarFactory>()
|
||||||
|
/**
 * Creates a [Resource] named [name] with content type `application/json` that wraps [obj];
 * [serializationStrategy] is handed to the resource as the serializer for [obj].
 */
fun <T : Any> json(name: String, obj: T, serializationStrategy: SerializationStrategy<T>): Resource =
    SerializableResource(
        filename = name,
        contentType = ContentType.Application.Json,
        obj = obj,
        serializer = serializationStrategy,
    )
|
||||||
fun file(path: String): Resource {
|
fun file(path: String): Resource {
|
||||||
return file(Paths.get(path))
|
return file(Paths.get(path))
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,8 @@
|
||||||
package de.itkl.core_api.interfaces.data
|
package de.itkl.core_api.interfaces.data
|
||||||
|
|
||||||
import de.itkl.core_api.interfaces.FileProcessor
|
import de.itkl.core_api.interfaces.FileProcessor
|
||||||
|
import de.itkl.core_api.interfaces.FileProcessor2
|
||||||
|
|
||||||
interface Processable {
|
interface Processable {
|
||||||
suspend fun process(fileProcessor: FileProcessor)
|
suspend fun process(fileProcessor: FileProcessor2)
|
||||||
}
|
}
|
||||||
|
|
@ -2,12 +2,15 @@ package de.itkl.httpClient.clients
|
||||||
|
|
||||||
import de.itkl.core_api.dtos.MsOcrResponse
|
import de.itkl.core_api.dtos.MsOcrResponse
|
||||||
import de.itkl.core_api.interfaces.FileProcessor
|
import de.itkl.core_api.interfaces.FileProcessor
|
||||||
|
import de.itkl.core_api.interfaces.FileProcessor2
|
||||||
import de.itkl.core_api.interfaces.Resource
|
import de.itkl.core_api.interfaces.Resource
|
||||||
|
import de.itkl.core_api.interfaces.ResourceFactory
|
||||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||||
import io.ktor.client.*
|
import io.ktor.client.*
|
||||||
import io.ktor.client.call.*
|
import io.ktor.client.call.*
|
||||||
import io.ktor.client.request.*
|
import io.ktor.client.request.*
|
||||||
import io.ktor.client.statement.*
|
import io.ktor.client.statement.*
|
||||||
|
import io.ktor.client.utils.EmptyContent.contentType
|
||||||
import io.ktor.http.*
|
import io.ktor.http.*
|
||||||
import kotlinx.serialization.json.Json
|
import kotlinx.serialization.json.Json
|
||||||
import org.koin.core.component.KoinComponent
|
import org.koin.core.component.KoinComponent
|
||||||
|
|
@ -18,8 +21,9 @@ import kotlin.io.path.nameWithoutExtension
|
||||||
import kotlin.io.path.writeText
|
import kotlin.io.path.writeText
|
||||||
|
|
||||||
private val Log = KotlinLogging.logger { }
|
private val Log = KotlinLogging.logger { }
|
||||||
class MsOcr: KoinComponent, FileProcessor {
|
class MsOcr: KoinComponent, FileProcessor2 {
|
||||||
private val httpClient: HttpClient by inject()
|
private val httpClient: HttpClient by inject()
|
||||||
|
private val resourceFactory: ResourceFactory by inject()
|
||||||
|
|
||||||
suspend fun ocr(resource: Resource): MsOcrResponse {
|
suspend fun ocr(resource: Resource): MsOcrResponse {
|
||||||
val response = httpClient.post {
|
val response = httpClient.post {
|
||||||
|
|
@ -34,15 +38,10 @@ class MsOcr: KoinComponent, FileProcessor {
|
||||||
return response.body()
|
return response.body()
|
||||||
}
|
}
|
||||||
|
|
||||||
override fun willProduce(path: Path): Path {
|
override val filename = "ms-ocr.json"
|
||||||
return path.parent.resolve(path.nameWithoutExtension + ".ms-ocr.json")
|
|
||||||
}
|
|
||||||
|
|
||||||
override suspend fun process(resource: Resource): File {
|
override suspend fun process(resource: Resource): Resource {
|
||||||
val result = ocr(resource)
|
val result = ocr(resource)
|
||||||
val jsonString = Json.encodeToString(MsOcrResponse.serializer(), result)
|
return resourceFactory.json(filename, result, MsOcrResponse.serializer())
|
||||||
val destination = willProduce(resource.path!!)
|
|
||||||
destination.writeText(jsonString)
|
|
||||||
return destination.toFile()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -22,15 +22,12 @@ class CorpusFactory : KoinComponent {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
class Corpus(private val project: Project): Processable, KoinComponent {
|
class Corpus(private val project: Project): KoinComponent {
|
||||||
val displayName get() = project.displayName
|
val displayName get() = project.displayName
|
||||||
val documentNames get() = project.documentNames
|
val documentNames get() = project.documentNames
|
||||||
|
|
||||||
private val resourceFactory: ResourceFactory by inject()
|
private val resourceFactory: ResourceFactory by inject()
|
||||||
|
|
||||||
override suspend fun process(fileProcessor: FileProcessor) {
|
|
||||||
TODO("NEXT")
|
|
||||||
}
|
|
||||||
suspend fun document(name: String): Document {
|
suspend fun document(name: String): Document {
|
||||||
return Document(name, listOf(project.resource(name)!!))
|
return Document(name, listOf(project.resource(name)!!))
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,10 @@ package de.itkl.textprocessing
|
||||||
import de.itkl.assetmanager.interfaces.AssetManager
|
import de.itkl.assetmanager.interfaces.AssetManager
|
||||||
import de.itkl.core_api.dtos.MsOcrResponse
|
import de.itkl.core_api.dtos.MsOcrResponse
|
||||||
import de.itkl.core_api.interfaces.FileProcessor
|
import de.itkl.core_api.interfaces.FileProcessor
|
||||||
|
import de.itkl.core_api.interfaces.FileProcessor2
|
||||||
import de.itkl.core_api.interfaces.Resource
|
import de.itkl.core_api.interfaces.Resource
|
||||||
|
import de.itkl.core_api.interfaces.assets.Assets
|
||||||
|
import de.itkl.core_api.interfaces.assets.FileProcessorBackend
|
||||||
import de.itkl.core_api.interfaces.data.Processable
|
import de.itkl.core_api.interfaces.data.Processable
|
||||||
import kotlinx.coroutines.flow.Flow
|
import kotlinx.coroutines.flow.Flow
|
||||||
import kotlinx.coroutines.flow.asFlow
|
import kotlinx.coroutines.flow.asFlow
|
||||||
|
|
@ -18,6 +21,10 @@ class Document(
|
||||||
val resources: List<Resource>
|
val resources: List<Resource>
|
||||||
) : Processable, KoinComponent {
|
) : Processable, KoinComponent {
|
||||||
private val assetManager: AssetManager by inject()
|
private val assetManager: AssetManager by inject()
|
||||||
|
private val fileProcessorBackend: FileProcessorBackend by inject()
|
||||||
|
/** Returns what the injected [AssetManager] yields for this document's [name]. */
suspend fun assets(): Assets = assetManager.assets(name)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads the extracted ocr pages. Note that not every pages
|
* Loads the extracted ocr pages. Note that not every pages
|
||||||
|
|
@ -25,7 +32,7 @@ class Document(
|
||||||
*/
|
*/
|
||||||
suspend fun retrieveOcrPages(): List<OcrPage> {
|
suspend fun retrieveOcrPages(): List<OcrPage> {
|
||||||
// TODO: How to identify the assets independently from their name?
|
// TODO: How to identify the assets independently from their name?
|
||||||
val resource = checkNotNull(assetManager.assets(name)
|
val resource = checkNotNull(assets()
|
||||||
.retrieve("ms-ocr")) {
|
.retrieve("ms-ocr")) {
|
||||||
"Ocr for $name is not yet created"
|
"Ocr for $name is not yet created"
|
||||||
}
|
}
|
||||||
|
|
@ -33,9 +40,12 @@ class Document(
|
||||||
|
|
||||||
return msOcrResponse.analyzeResult.readResults.map { toOcrPage(it) }
|
return msOcrResponse.analyzeResult.readResults.map { toOcrPage(it) }
|
||||||
}
|
}
|
||||||
override suspend fun process(fileProcessor: FileProcessor) {
|
override suspend fun process(fileProcessor: FileProcessor2) {
|
||||||
// TODO: Rework the whole fileprocessor. Should work indepently from a pipeline
|
fileProcessorBackend.process(
|
||||||
fileProcessor.process(resources.first())
|
resources.first(),
|
||||||
|
assets(),
|
||||||
|
fileProcessor
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
private fun toOcrPage(readResult: MsOcrResponse.AnalyzeResult.ReadResult): OcrPage {
|
private fun toOcrPage(readResult: MsOcrResponse.AnalyzeResult.ReadResult): OcrPage {
|
||||||
Loading…
Reference in New Issue