starting with fileprocessor2
parent
9ea725fc36
commit
2cab145008
|
|
@ -1,12 +1,15 @@
|
|||
package de.itkl.assetmanager
|
||||
|
||||
import de.itkl.assetmanager.implementation.AssetsFileProcessorBackend
|
||||
import de.itkl.assetmanager.implementation.FilesystemAssetManager
|
||||
import de.itkl.assetmanager.implementation.FilesystemProjectManager
|
||||
import de.itkl.assetmanager.interfaces.AssetManager
|
||||
import de.itkl.assetmanager.interfaces.ProjectManager
|
||||
import de.itkl.core_api.interfaces.assets.FileProcessorBackend
|
||||
import org.koin.dsl.module
|
||||
|
||||
/**
 * Koin module of the asset manager.
 *
 * Registers one singleton each for [ProjectManager] (a [FilesystemProjectManager]),
 * [AssetManager] (a [FilesystemAssetManager]) and [FileProcessorBackend]
 * (an [AssetsFileProcessorBackend]).
 */
val assetManagerModule = module {
    single<ProjectManager> { FilesystemProjectManager() }
    single<AssetManager> { FilesystemAssetManager() }
    single<FileProcessorBackend> { AssetsFileProcessorBackend() }
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
package de.itkl.assetmanager.implementation
|
||||
|
||||
import de.itkl.core_api.interfaces.FileProcessor2
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.core_api.interfaces.assets.Assets
|
||||
import de.itkl.core_api.interfaces.assets.FileProcessorBackend
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
import org.koin.core.component.KoinComponent
|
||||
|
||||
private val Log = KotlinLogging.logger { }
|
||||
/**
 * [FileProcessorBackend] that only runs a [FileProcessor2] when its output
 * is not already present in the given [Assets].
 *
 * The processor's `filename` is the key used for the existence check.
 */
class AssetsFileProcessorBackend : FileProcessorBackend, KoinComponent {

    /**
     * Applies [fileProcessor] to [resource] and stores the produced resource in [assets].
     * Nothing is processed or stored when [assets] already reports an entry
     * under the processor's filename.
     */
    override suspend fun process(resource: Resource, assets: Assets, fileProcessor: FileProcessor2) {
        Log.debug { "Call processor '${fileProcessor.filename}' on $resource" }

        // Guard: the result is already stored, so there is nothing to do.
        if (assets.exists(fileProcessor.filename)) {
            Log.info { "${fileProcessor.filename} already exists on ${resource}. Skipping" }
            return
        }

        Log.info { "${fileProcessor.filename} does not yet exists for $resource" }
        assets.store(fileProcessor.process(resource))
    }
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
package de.itkl.assetmanager.implementation
|
||||
|
||||
import de.itkl.assetmanager.interfaces.AssetManager
|
||||
import de.itkl.assetmanager.interfaces.Assets
|
||||
import de.itkl.core_api.interfaces.assets.Assets
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.core_api.interfaces.ResourceFactory
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
package de.itkl.assetmanager.implementation
|
||||
|
||||
import de.itkl.assetmanager.interfaces.AssetManager
|
||||
import de.itkl.assetmanager.interfaces.Assets
|
||||
import de.itkl.core_api.interfaces.assets.Assets
|
||||
import de.itkl.assetmanager.interfaces.Project
|
||||
import de.itkl.assetmanager.interfaces.ProjectManager
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package de.itkl.assetmanager.interfaces
|
||||
|
||||
import de.itkl.core_api.interfaces.assets.Assets
|
||||
|
||||
/**
|
||||
* Manage the assets for one document
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -1,10 +0,0 @@
|
|||
package de.itkl.assetmanager.interfaces
|
||||
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import kotlinx.coroutines.flow.Flow
|
||||
|
||||
/**
 * A [Flow] of [Resource]s that additionally supports storing, looking up
 * and deleting individual resources.
 */
interface Assets : Flow<Resource> {

    /** Stores [resource]. */
    suspend fun store(resource: Resource)

    /**
     * Looks up the resource identified by [name].
     *
     * NOTE(review): `null` presumably means that no such resource is stored — confirm against the implementations.
     */
    suspend fun retrieve(name: String): Resource?

    /** Deletes the resource identified by [name]. */
    suspend fun delete(name: String)
}
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
package de.itkl.assetmanager.interfaces
|
||||
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.core_api.interfaces.assets.Assets
|
||||
|
||||
/**
|
||||
* A set of documents. Each can hold its own assets
|
||||
|
|
|
|||
|
|
@ -0,0 +1,34 @@
|
|||
package de.itkl.core_api.implementation
|
||||
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import io.ktor.http.*
|
||||
import kotlinx.serialization.*
|
||||
import kotlinx.serialization.json.Json
|
||||
import kotlinx.serialization.json.encodeToStream
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.io.UnsupportedEncodingException
|
||||
import java.nio.file.Path
|
||||
|
||||
/**
 * [Resource] whose content is produced by serializing an in-memory object.
 *
 * Only [ContentType.Application.Json] is supported. [length], [file] and [path]
 * are always `null`.
 */
class SerializableResource<T : Any> @OptIn(ExperimentalSerializationApi::class) constructor(
    override val filename: String,
    override val contentType: ContentType,
    private val obj: T,
    private val serializer: SerializationStrategy<T>
) : Resource {

    override val length: Long? = null
    override val file: File? = null
    override val path: Path? = null

    /** Serializes the wrapped object on every call and exposes the text as a stream. */
    override fun read(): InputStream = serialize().byteInputStream()

    /**
     * Encodes the wrapped object according to [contentType].
     *
     * @throws UnsupportedEncodingException if [contentType] is anything other than JSON
     */
    private fun serialize(): String {
        if (contentType != ContentType.Application.Json) {
            throw UnsupportedEncodingException("Sorry but $contentType is not supported for Resources")
        }
        return Json.encodeToString(serializer, obj)
    }
}
|
||||
|
|
@ -2,8 +2,14 @@ package de.itkl.core_api.interfaces
|
|||
|
||||
import java.io.File
|
||||
import java.nio.file.Path
|
||||
import java.util.function.Consumer
|
||||
|
||||
/**
 * Processor that turns a [Resource] into a [File].
 */
interface FileProcessor {

    /**
     * Returns the path this processor produces for the input at [path].
     *
     * NOTE(review): presumably the location of the file returned by [process] — confirm against implementations.
     */
    fun willProduce(path: Path): Path

    /** Processes [resource] and returns the resulting [File]. */
    suspend fun process(resource: Resource): File
}
|
||||
|
||||
/**
 * Processor that turns one [Resource] into another [Resource].
 */
interface FileProcessor2 {

    /** Name associated with this processor's output. */
    val filename: String

    /** Processes [resource] and returns the resulting [Resource]. */
    suspend fun process(resource: Resource): Resource
}
|
||||
|
|
@ -23,6 +23,7 @@ interface Resource {
|
|||
/**
 * Reads the whole resource and decodes it as JSON using [deserializer].
 *
 * The bytes returned by [read] are decoded as UTF-8 text before parsing, and the
 * stream is closed afterwards.
 *
 * @param deserializer strategy describing how to decode the JSON document into [T]
 * @return the decoded object
 */
fun <T: Any> json(deserializer: DeserializationStrategy<T>): T {
    // decodeToString() yields the textual content of the bytes. contentToString()
    // would yield the array's debug rendering ("[123, 34, ...]"), not the document.
    val text = read().use { stream -> stream.readAllBytes().decodeToString() }
    return Json.decodeFromString(deserializer, text)
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -36,4 +37,8 @@ abstract class AbstractResource : Resource, KoinComponent {
|
|||
/** Opens the resource content by delegating to `doRead()`; cannot be overridden further. */
final override fun read(): InputStream = doRead()
|
||||
|
||||
/** A resource is rendered as its [filename]. */
override fun toString(): String = filename
|
||||
}
|
||||
|
|
@ -2,6 +2,9 @@ package de.itkl.core_api.interfaces
|
|||
|
||||
import de.itkl.core_api.implementation.FileResource
|
||||
import de.itkl.core_api.implementation.ProgressResource
|
||||
import de.itkl.core_api.implementation.SerializableResource
|
||||
import io.ktor.http.*
|
||||
import kotlinx.serialization.SerializationStrategy
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.inject
|
||||
import java.io.File
|
||||
|
|
@ -11,7 +14,13 @@ import java.nio.file.Paths
|
|||
class ResourceFactory : KoinComponent {
|
||||
|
||||
private val progressBarFactory by inject<ProgressBarFactory>()
|
||||
|
||||
/**
 * Creates a JSON [Resource] named [name] that wraps [obj] and serializes it
 * with [serializationStrategy].
 */
fun <T : Any> json(name: String, obj: T, serializationStrategy: SerializationStrategy<T>): Resource =
    SerializableResource<T>(name, ContentType.Application.Json, obj, serializationStrategy)
|
||||
/** Convenience overload: converts [path] to a `Path` and delegates to the `Path`-based `file`. */
fun file(path: String): Resource = file(Paths.get(path))
|
||||
|
|
|
|||
|
|
@ -1,7 +1,8 @@
|
|||
package de.itkl.core_api.interfaces.data
|
||||
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
import de.itkl.core_api.interfaces.FileProcessor2
|
||||
|
||||
/**
 * Contract for types that accept a file processor through [process].
 */
interface Processable {

    /** Runs the given [FileProcessor] on this object. */
    suspend fun process(fileProcessor: FileProcessor)

    /** Runs the given [FileProcessor2] on this object. */
    suspend fun process(fileProcessor: FileProcessor2)
}
|
||||
|
|
@ -2,12 +2,15 @@ package de.itkl.httpClient.clients
|
|||
|
||||
import de.itkl.core_api.dtos.MsOcrResponse
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
import de.itkl.core_api.interfaces.FileProcessor2
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.core_api.interfaces.ResourceFactory
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
import io.ktor.client.*
|
||||
import io.ktor.client.call.*
|
||||
import io.ktor.client.request.*
|
||||
import io.ktor.client.statement.*
|
||||
import io.ktor.client.utils.EmptyContent.contentType
|
||||
import io.ktor.http.*
|
||||
import kotlinx.serialization.json.Json
|
||||
import org.koin.core.component.KoinComponent
|
||||
|
|
@ -18,8 +21,9 @@ import kotlin.io.path.nameWithoutExtension
|
|||
import kotlin.io.path.writeText
|
||||
|
||||
private val Log = KotlinLogging.logger { }
|
||||
class MsOcr: KoinComponent, FileProcessor {
|
||||
class MsOcr: KoinComponent, FileProcessor2 {
|
||||
private val httpClient: HttpClient by inject()
|
||||
private val resourceFactory: ResourceFactory by inject()
|
||||
|
||||
suspend fun ocr(resource: Resource): MsOcrResponse {
|
||||
val response = httpClient.post {
|
||||
|
|
@ -34,15 +38,10 @@ class MsOcr: KoinComponent, FileProcessor {
|
|||
return response.body()
|
||||
}
|
||||
|
||||
override fun willProduce(path: Path): Path {
|
||||
return path.parent.resolve(path.nameWithoutExtension + ".ms-ocr.json")
|
||||
}
|
||||
override val filename = "ms-ocr.json"
|
||||
|
||||
override suspend fun process(resource: Resource): File {
|
||||
override suspend fun process(resource: Resource): Resource {
|
||||
val result = ocr(resource)
|
||||
val jsonString = Json.encodeToString(MsOcrResponse.serializer(), result)
|
||||
val destination = willProduce(resource.path!!)
|
||||
destination.writeText(jsonString)
|
||||
return destination.toFile()
|
||||
return resourceFactory.json(filename, result, MsOcrResponse.serializer())
|
||||
}
|
||||
}
|
||||
|
|
@ -22,15 +22,12 @@ class CorpusFactory : KoinComponent {
|
|||
}
|
||||
}
|
||||
}
|
||||
class Corpus(private val project: Project): Processable, KoinComponent {
|
||||
class Corpus(private val project: Project): KoinComponent {
|
||||
val displayName get() = project.displayName
|
||||
val documentNames get() = project.documentNames
|
||||
|
||||
private val resourceFactory: ResourceFactory by inject()
|
||||
|
||||
override suspend fun process(fileProcessor: FileProcessor) {
|
||||
TODO("NEXT")
|
||||
}
|
||||
/**
 * Builds the [Document] called [name] from the project's resource of the same name.
 *
 * @param name name of the document inside the project
 * @return a document holding exactly that one resource
 * @throws IllegalStateException if the project has no resource named [name]
 */
suspend fun document(name: String): Document {
    // Fail with a descriptive message instead of a bare NullPointerException from `!!`.
    val resource = checkNotNull(project.resource(name)) {
        "Project '${project.displayName}' has no resource named '$name'"
    }
    return Document(name, listOf(resource))
}
|
||||
|
|
|
|||
|
|
@ -3,7 +3,10 @@ package de.itkl.textprocessing
|
|||
import de.itkl.assetmanager.interfaces.AssetManager
|
||||
import de.itkl.core_api.dtos.MsOcrResponse
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
import de.itkl.core_api.interfaces.FileProcessor2
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.core_api.interfaces.assets.Assets
|
||||
import de.itkl.core_api.interfaces.assets.FileProcessorBackend
|
||||
import de.itkl.core_api.interfaces.data.Processable
|
||||
import kotlinx.coroutines.flow.Flow
|
||||
import kotlinx.coroutines.flow.asFlow
|
||||
|
|
@ -18,6 +21,10 @@ class Document(
|
|||
val resources: List<Resource>
|
||||
) : Processable, KoinComponent {
|
||||
private val assetManager: AssetManager by inject()
|
||||
private val fileProcessorBackend: FileProcessorBackend by inject()
|
||||
/** Returns the [Assets] that the asset manager holds under this document's [name]. */
suspend fun assets(): Assets = assetManager.assets(name)
|
||||
|
||||
/**
|
||||
* Loads the extracted OCR pages. Note that not every page
|
||||
|
|
@ -25,7 +32,7 @@ class Document(
|
|||
*/
|
||||
suspend fun retrieveOcrPages(): List<OcrPage> {
|
||||
// TODO: How to identify the assets independently from their name?
|
||||
val resource = checkNotNull(assetManager.assets(name)
|
||||
val resource = checkNotNull(assets()
|
||||
.retrieve("ms-ocr")) {
|
||||
"Ocr for $name is not yet created"
|
||||
}
|
||||
|
|
@ -33,9 +40,12 @@ class Document(
|
|||
|
||||
return msOcrResponse.analyzeResult.readResults.map { toOcrPage(it) }
|
||||
}
|
||||
override suspend fun process(fileProcessor: FileProcessor) {
|
||||
// TODO: Rework the whole file processor. It should work independently of a pipeline
|
||||
fileProcessor.process(resources.first())
|
||||
override suspend fun process(fileProcessor: FileProcessor2) {
|
||||
fileProcessorBackend.process(
|
||||
resources.first(),
|
||||
assets(),
|
||||
fileProcessor
|
||||
)
|
||||
}
|
||||
|
||||
private fun toOcrPage(readResult: MsOcrResponse.AnalyzeResult.ReadResult): OcrPage {
|
||||
Loading…
Reference in New Issue