Compare commits
No commits in common. "9f3813a83abe26d9a0da9f3402ae0a233b100597" and "2deaa204c5ef3d10db58047ccf7c1c2121b29fdd" have entirely different histories.
9f3813a83a
...
2deaa204c5
|
|
@ -1,36 +0,0 @@
|
||||||
<component name="ProjectRunConfigurationManager">
|
|
||||||
<configuration default="false" name="docthor [clean]" type="GradleRunConfiguration" factoryName="Gradle" nameIsGenerated="true">
|
|
||||||
<ExternalSystemSettings>
|
|
||||||
<option name="executionName" />
|
|
||||||
<option name="externalProjectPath" value="$PROJECT_DIR$" />
|
|
||||||
<option name="externalSystemIdString" value="GRADLE" />
|
|
||||||
<option name="scriptParameters" value="" />
|
|
||||||
<option name="taskDescriptions">
|
|
||||||
<list />
|
|
||||||
</option>
|
|
||||||
<option name="taskNames">
|
|
||||||
<list>
|
|
||||||
<option value="clean" />
|
|
||||||
</list>
|
|
||||||
</option>
|
|
||||||
<option name="vmOptions" />
|
|
||||||
</ExternalSystemSettings>
|
|
||||||
<ExternalSystemDebugServerProcess>true</ExternalSystemDebugServerProcess>
|
|
||||||
<ExternalSystemReattachDebugProcess>true</ExternalSystemReattachDebugProcess>
|
|
||||||
<EXTENSION ID="com.intellij.execution.ExternalSystemRunConfigurationJavaExtension">
|
|
||||||
<extension name="net.ashald.envfile">
|
|
||||||
<option name="IS_ENABLED" value="false" />
|
|
||||||
<option name="IS_SUBST" value="false" />
|
|
||||||
<option name="IS_PATH_MACRO_SUPPORTED" value="false" />
|
|
||||||
<option name="IS_IGNORE_MISSING_FILES" value="false" />
|
|
||||||
<option name="IS_ENABLE_EXPERIMENTAL_INTEGRATIONS" value="false" />
|
|
||||||
<ENTRIES>
|
|
||||||
<ENTRY IS_ENABLED="true" PARSER="runconfig" IS_EXECUTABLE="false" />
|
|
||||||
</ENTRIES>
|
|
||||||
</extension>
|
|
||||||
</EXTENSION>
|
|
||||||
<DebugAllEnabled>false</DebugAllEnabled>
|
|
||||||
<RunAsTest>false</RunAsTest>
|
|
||||||
<method v="2" />
|
|
||||||
</configuration>
|
|
||||||
</component>
|
|
||||||
|
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
<instance-profile id="d"
|
<instance-profile id="d"
|
||||||
name="Docthor"
|
name="Docthor"
|
||||||
start-page="docthor.md">
|
start-page="starter-topic.md">
|
||||||
|
|
||||||
<toc-element topic="docthor.md"/>
|
<toc-element topic="starter-topic.md"/>
|
||||||
</instance-profile>
|
</instance-profile>
|
||||||
|
|
@ -21,12 +21,4 @@ All libraries should be placed unter <path>libraries</path>
|
||||||
<def title="io">
|
<def title="io">
|
||||||
Abstraction about reading/writing to resources (filesystem, http, s3, etc pp)
|
Abstraction about reading/writing to resources (filesystem, http, s3, etc pp)
|
||||||
</def>
|
</def>
|
||||||
<def title="core-api">
|
|
||||||
Defines the core interfaces
|
|
||||||
</def>
|
|
||||||
<def title="tui">
|
|
||||||
Provides tui capabilities. When applied as koin modules
|
|
||||||
the resources will automatically print a read/write progressbar
|
|
||||||
on terminal.
|
|
||||||
</def>
|
|
||||||
</deflist>
|
</deflist>
|
||||||
|
|
@ -4,7 +4,6 @@ plugins {
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation(project(":libraries:tfidf"))
|
implementation(project(":libraries:tfidf"))
|
||||||
implementation(project(":libraries:tui"))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
application {
|
application {
|
||||||
|
|
|
||||||
|
|
@ -6,13 +6,15 @@ import com.github.ajalt.clikt.parameters.options.option
|
||||||
import com.github.ajalt.clikt.parameters.options.required
|
import com.github.ajalt.clikt.parameters.options.required
|
||||||
import com.github.ajalt.clikt.parameters.types.enum
|
import com.github.ajalt.clikt.parameters.types.enum
|
||||||
import com.github.ajalt.clikt.parameters.types.file
|
import com.github.ajalt.clikt.parameters.types.file
|
||||||
import de.itkl.core_api.coreApiModule
|
import de.itkl.fileprocessing.ProgressBarFactory
|
||||||
import de.itkl.textprocessing.textProcessingModule
|
import de.itkl.textprocessing.textProcessingModule
|
||||||
import de.itkl.tfidf.Language
|
import de.itkl.tfidf.Language
|
||||||
|
import de.itkl.tfidf.TerminalProgressBarFactory
|
||||||
|
//import de.itkl.tfidf.TfIdf
|
||||||
import de.itkl.tfidf.TfIdfPipeline
|
import de.itkl.tfidf.TfIdfPipeline
|
||||||
import de.itkl.tui.tuiModule
|
|
||||||
import kotlinx.coroutines.runBlocking
|
import kotlinx.coroutines.runBlocking
|
||||||
import org.koin.core.context.startKoin
|
import org.koin.core.context.startKoin
|
||||||
|
import org.koin.dsl.module
|
||||||
|
|
||||||
class ComputeIdf : CliktCommand() {
|
class ComputeIdf : CliktCommand() {
|
||||||
private val corpus by option(help = "corpus")
|
private val corpus by option(help = "corpus")
|
||||||
|
|
@ -31,9 +33,12 @@ class ComputeIdf : CliktCommand() {
|
||||||
fun main(args: Array<String>) {
|
fun main(args: Array<String>) {
|
||||||
startKoin {
|
startKoin {
|
||||||
modules(
|
modules(
|
||||||
coreApiModule,
|
|
||||||
textProcessingModule,
|
textProcessingModule,
|
||||||
tuiModule)
|
module {
|
||||||
|
single<ProgressBarFactory> {
|
||||||
|
TerminalProgressBarFactory()
|
||||||
|
}
|
||||||
|
})
|
||||||
ComputeIdf().main(args)
|
ComputeIdf().main(args)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,5 +7,5 @@ repositories {
|
||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:$embeddedKotlinVersion")
|
implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:1.8.20")
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -13,11 +13,6 @@ dependencies {
|
||||||
val koin_version = "3.5.3"
|
val koin_version = "3.5.3"
|
||||||
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
|
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
|
||||||
implementation("io.insert-koin:koin-core:$koin_version")
|
implementation("io.insert-koin:koin-core:$koin_version")
|
||||||
implementation("org.jetbrains.kotlinx:kotlinx-datetime:0.5.0")
|
|
||||||
implementation("org.jetbrains.kotlinx:kotlinx-serialization-json:1.6.2")
|
|
||||||
|
|
||||||
|
|
||||||
testImplementation("io.insert-koin:koin-test:$koin_version")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
java {
|
java {
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
package de.itkl.clients
|
||||||
|
|
||||||
|
class MsOcr {
|
||||||
|
|
||||||
|
suspend fun ocr() {}
|
||||||
|
}
|
||||||
|
|
@ -1,11 +0,0 @@
|
||||||
package de.itkl.core_api
|
|
||||||
|
|
||||||
import de.itkl.core_api.interfaces.NoopResourceReadDecorator
|
|
||||||
import de.itkl.core_api.interfaces.ResourceFactory
|
|
||||||
import de.itkl.core_api.interfaces.ResourceReadDecorator
|
|
||||||
import org.koin.dsl.module
|
|
||||||
|
|
||||||
val coreApiModule = module {
|
|
||||||
single<ResourceFactory> { ResourceFactory()}
|
|
||||||
single<ResourceReadDecorator> { NoopResourceReadDecorator() }
|
|
||||||
}
|
|
||||||
|
|
@ -1,24 +0,0 @@
|
||||||
package de.itkl.core_api.implementation
|
|
||||||
|
|
||||||
import de.itkl.core_api.interfaces.AbstractResource
|
|
||||||
import io.ktor.http.*
|
|
||||||
import java.io.File
|
|
||||||
import java.io.InputStream
|
|
||||||
import java.nio.file.Files
|
|
||||||
import java.nio.file.Path
|
|
||||||
import kotlin.io.path.name
|
|
||||||
|
|
||||||
class FileResource(override val path: Path) : AbstractResource() {
|
|
||||||
constructor(file: File): this(file.toPath())
|
|
||||||
override val length: Long by lazy { path.toFile().length() }
|
|
||||||
override val file: File?
|
|
||||||
get() = path.toFile()
|
|
||||||
|
|
||||||
override fun doRead(): InputStream {
|
|
||||||
return Files.newInputStream(path)
|
|
||||||
}
|
|
||||||
override val filename: String
|
|
||||||
get() = path.name
|
|
||||||
override val contentType: ContentType
|
|
||||||
get() = ContentType.fromFilePath(path.name).first()
|
|
||||||
}
|
|
||||||
|
|
@ -1,19 +0,0 @@
|
||||||
package de.itkl.core_api.implementation
|
|
||||||
|
|
||||||
import de.itkl.core_api.implementation.ProgressInputStream
|
|
||||||
import de.itkl.core_api.interfaces.ProgressBarFactory
|
|
||||||
import de.itkl.core_api.interfaces.Resource
|
|
||||||
import java.io.InputStream
|
|
||||||
|
|
||||||
internal class ProgressResource(
|
|
||||||
private val resource: Resource,
|
|
||||||
private val progressBarFactory: ProgressBarFactory
|
|
||||||
) : Resource by resource
|
|
||||||
{
|
|
||||||
override fun read(): InputStream {
|
|
||||||
return ProgressInputStream(
|
|
||||||
resource.read(),
|
|
||||||
progressBarFactory.new(this)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
package de.itkl.core_api.interfaces
|
package de.itkl.core_api.interfaces
|
||||||
|
|
||||||
import java.io.File
|
import java.io.File
|
||||||
|
import java.io.InputStream
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
|
|
||||||
interface FileProcessor {
|
interface FileProcessor {
|
||||||
|
|
|
||||||
|
|
@ -3,32 +3,20 @@ package de.itkl.core_api.interfaces
|
||||||
import io.ktor.http.*
|
import io.ktor.http.*
|
||||||
import org.koin.core.component.KoinComponent
|
import org.koin.core.component.KoinComponent
|
||||||
import org.koin.core.component.get
|
import org.koin.core.component.get
|
||||||
import java.io.File
|
|
||||||
import java.io.InputStream
|
import java.io.InputStream
|
||||||
import java.nio.file.Path
|
|
||||||
|
|
||||||
interface Resource {
|
abstract class Resource : KoinComponent {
|
||||||
val filename: String
|
abstract val filename: String
|
||||||
val contentType: ContentType
|
abstract val contentType: ContentType
|
||||||
// TODO: Find a better method to avoid those nulls. Maybe subtyping the interface
|
abstract val length: Long?
|
||||||
val length: Long?
|
|
||||||
val file: File?
|
|
||||||
val path: Path?
|
|
||||||
fun read(): InputStream
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
protected abstract fun doRead(): InputStream
|
||||||
* Automatically adds koin injectable decorators to reading/writing
|
fun read(): InputStream {
|
||||||
* operations
|
|
||||||
*/
|
|
||||||
abstract class AbstractResource : Resource, KoinComponent {
|
|
||||||
abstract fun doRead(): InputStream
|
|
||||||
final override fun read(): InputStream {
|
|
||||||
return length?.let { length ->
|
return length?.let { length ->
|
||||||
get<ResourceReadDecorator>().decorate(
|
get<ResourceReadDecorator>().decorate(
|
||||||
length = length,
|
length = length,
|
||||||
doRead()
|
read()
|
||||||
)
|
)
|
||||||
} ?: doRead()
|
} ?: read()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1,16 +0,0 @@
|
||||||
package de.itkl.core_api.interfaces
|
|
||||||
|
|
||||||
import de.itkl.core_api.implementation.FileResource
|
|
||||||
import de.itkl.core_api.implementation.ProgressResource
|
|
||||||
import org.koin.core.component.KoinComponent
|
|
||||||
import org.koin.core.component.inject
|
|
||||||
import java.io.File
|
|
||||||
|
|
||||||
class ResourceFactory : KoinComponent {
|
|
||||||
|
|
||||||
private val progressBarFactory by inject<ProgressBarFactory>()
|
|
||||||
fun file(file: File): Resource {
|
|
||||||
val resource = FileResource(file)
|
|
||||||
return ProgressResource(resource, progressBarFactory)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
package de.itkl.core_api.interfaces.data
|
|
||||||
interface DataTable : Iterable<List<String>> {
|
|
||||||
val columns: List<String>
|
|
||||||
}
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
package de.itkl.fileprocessing
|
package de.itkl.fileprocessing
|
||||||
|
|
||||||
import de.itkl.core_api.interfaces.FileProcessor
|
import de.itkl.core_api.interfaces.FileProcessor
|
||||||
import de.itkl.core_api.interfaces.ResourceFactory
|
|
||||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||||
import org.koin.core.component.KoinComponent
|
import org.koin.core.component.KoinComponent
|
||||||
import org.koin.core.component.inject
|
import org.koin.core.component.inject
|
||||||
|
|
@ -11,9 +10,10 @@ import kotlin.io.path.exists
|
||||||
private val Log = KotlinLogging.logger { }
|
private val Log = KotlinLogging.logger { }
|
||||||
|
|
||||||
abstract class FileProcessingPipeline(private val force: Boolean = false) : KoinComponent {
|
abstract class FileProcessingPipeline(private val force: Boolean = false) : KoinComponent {
|
||||||
private val resourceFactory: ResourceFactory by inject()
|
|
||||||
|
|
||||||
protected abstract val fileProcessor: List<FileProcessor>
|
protected abstract val fileProcessor: List<FileProcessor>
|
||||||
|
private val progressBarFactory: ProgressBarFactory by inject()
|
||||||
suspend fun input(file: File) {
|
suspend fun input(file: File) {
|
||||||
var currentFile = file
|
var currentFile = file
|
||||||
fileProcessor.forEach { processor ->
|
fileProcessor.forEach { processor ->
|
||||||
|
|
@ -22,8 +22,9 @@ abstract class FileProcessingPipeline(private val force: Boolean = false) : Koin
|
||||||
Log.info { "$target exists. Skipping" }
|
Log.info { "$target exists. Skipping" }
|
||||||
} else {
|
} else {
|
||||||
Log.info { "$target does not exists. Creating" }
|
Log.info { "$target does not exists. Creating" }
|
||||||
val resource = resourceFactory.file(currentFile)
|
val resource = FileResource(currentFile)
|
||||||
processor.process(resource)
|
val progress = ProgressResource(resource, progressBarFactory)
|
||||||
|
processor.process(progress)
|
||||||
Log.info { "File created: $target" }
|
Log.info { "File created: $target" }
|
||||||
}
|
}
|
||||||
currentFile = target.toFile()
|
currentFile = target.toFile()
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
package de.itkl.core_api.interfaces
|
package de.itkl.fileprocessing
|
||||||
|
|
||||||
interface ProgressBarFactory {
|
interface ProgressBarFactory {
|
||||||
fun new(resource: Resource): ProgressBar
|
fun new(resource: Resource): ProgressBar
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
package de.itkl.core_api.implementation
|
package de.itkl.fileprocessing
|
||||||
|
|
||||||
import de.itkl.core_api.interfaces.ProgressBar
|
|
||||||
import java.io.InputStream
|
import java.io.InputStream
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -10,10 +9,9 @@ import java.io.InputStream
|
||||||
* @property updateOp The operation to be executed when the number of bytes read changes.
|
* @property updateOp The operation to be executed when the number of bytes read changes.
|
||||||
* @property bytesRead The number of bytes read from the input stream.
|
* @property bytesRead The number of bytes read from the input stream.
|
||||||
*/
|
*/
|
||||||
internal class ProgressInputStream(
|
class ProgressInputStream(
|
||||||
private val inputStream: InputStream,
|
private val inputStream: InputStream,
|
||||||
private val progressBar: ProgressBar
|
private val progressBar: ProgressBar) : InputStream() {
|
||||||
) : InputStream() {
|
|
||||||
@Volatile
|
@Volatile
|
||||||
var bytesRead: Long = 0
|
var bytesRead: Long = 0
|
||||||
private set(value) {
|
private set(value) {
|
||||||
|
|
@ -0,0 +1,42 @@
|
||||||
|
package de.itkl.fileprocessing
|
||||||
|
|
||||||
|
import java.io.File
|
||||||
|
import java.io.InputStream
|
||||||
|
import java.nio.file.Files
|
||||||
|
import java.nio.file.Path
|
||||||
|
import kotlin.io.path.name
|
||||||
|
|
||||||
|
interface Resource {
|
||||||
|
val path: Path
|
||||||
|
val size: Long
|
||||||
|
val filename: String
|
||||||
|
fun toFile(): File = path.toFile()
|
||||||
|
|
||||||
|
fun length() = path.toFile().length()
|
||||||
|
|
||||||
|
fun read(): InputStream
|
||||||
|
}
|
||||||
|
|
||||||
|
class ProgressResource(
|
||||||
|
private val resource: Resource,
|
||||||
|
private val progressBarFactory: ProgressBarFactory
|
||||||
|
) : Resource by resource
|
||||||
|
{
|
||||||
|
override fun read(): InputStream {
|
||||||
|
return ProgressInputStream(
|
||||||
|
resource.read(),
|
||||||
|
progressBarFactory.new(this)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class FileResource(override val path: Path) : Resource {
|
||||||
|
constructor(file: File): this(file.toPath())
|
||||||
|
override val size: Long by lazy { path.toFile().length() }
|
||||||
|
override val filename: String
|
||||||
|
get() = path.name
|
||||||
|
|
||||||
|
override fun read(): InputStream {
|
||||||
|
return Files.newInputStream(path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
plugins {
|
|
||||||
kotlin("plugin.serialization") version embeddedKotlinVersion
|
|
||||||
}
|
|
||||||
|
|
||||||
val ktorVersion: String by project
|
|
||||||
|
|
||||||
dependencies {
|
|
||||||
api(project(":libraries:core-api"))
|
|
||||||
|
|
||||||
api("io.ktor:ktor-client-core:$ktorVersion")
|
|
||||||
api("io.ktor:ktor-client-core-jvm:$ktorVersion")
|
|
||||||
implementation("io.ktor:ktor-client-cio:$ktorVersion")
|
|
||||||
implementation("io.ktor:ktor-client-content-negotiation:$ktorVersion")
|
|
||||||
implementation("io.ktor:ktor-serialization-kotlinx-json:$ktorVersion")
|
|
||||||
}
|
|
||||||
|
|
@ -1 +0,0 @@
|
||||||
ktorVersion=2.3.7
|
|
||||||
|
|
@ -1,30 +0,0 @@
|
||||||
package de.itkl.httpClient.clients
|
|
||||||
|
|
||||||
import de.itkl.core_api.interfaces.Resource
|
|
||||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
|
||||||
import io.ktor.client.*
|
|
||||||
import io.ktor.client.call.*
|
|
||||||
import io.ktor.client.request.*
|
|
||||||
import io.ktor.client.statement.*
|
|
||||||
import io.ktor.http.*
|
|
||||||
import org.koin.core.component.KoinComponent
|
|
||||||
import org.koin.core.component.inject
|
|
||||||
|
|
||||||
private val Log = KotlinLogging.logger { }
|
|
||||||
class MsOcr: KoinComponent {
|
|
||||||
private val httpClient: HttpClient by inject()
|
|
||||||
|
|
||||||
suspend fun ocr(resource: Resource): MsOcrResponse {
|
|
||||||
val response = httpClient.post {
|
|
||||||
url("http://10.54.150.152:5000/vision/v3.2/read/syncAnalyze")
|
|
||||||
parameters {
|
|
||||||
append("language", "de")
|
|
||||||
append("readingOrder", "natural")
|
|
||||||
}
|
|
||||||
contentType(resource.contentType)
|
|
||||||
setBody(resource.read())
|
|
||||||
}
|
|
||||||
println("got response: ${response.status} in ${response.responseTime}")
|
|
||||||
return response.body()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,81 +0,0 @@
|
||||||
package de.itkl.httpClient.clients
|
|
||||||
|
|
||||||
|
|
||||||
import kotlinx.datetime.Instant
|
|
||||||
import kotlinx.datetime.LocalDateTime
|
|
||||||
import kotlinx.serialization.SerialName
|
|
||||||
import kotlinx.serialization.Serializable
|
|
||||||
|
|
||||||
@Serializable
|
|
||||||
data class MsOcrResponse(
|
|
||||||
@SerialName("analyzeResult")
|
|
||||||
val analyzeResult: AnalyzeResult,
|
|
||||||
@SerialName("createdDateTime")
|
|
||||||
val createdDateTime: Instant, // 2023-12-29T21:02:30Z
|
|
||||||
@SerialName("lastUpdatedDateTime")
|
|
||||||
val lastUpdatedDateTime: Instant, // 2023-12-29T21:02:31Z
|
|
||||||
@SerialName("status")
|
|
||||||
val status: String // succeeded
|
|
||||||
) {
|
|
||||||
@Serializable
|
|
||||||
data class AnalyzeResult(
|
|
||||||
@SerialName("modelVersion")
|
|
||||||
val modelVersion: String, // 2022-04-30
|
|
||||||
@SerialName("readResults")
|
|
||||||
val readResults: List<ReadResult>,
|
|
||||||
@SerialName("version")
|
|
||||||
val version: String // 3.2.0
|
|
||||||
) {
|
|
||||||
@Serializable
|
|
||||||
data class ReadResult(
|
|
||||||
@SerialName("angle")
|
|
||||||
val angle: Int, // 0
|
|
||||||
@SerialName("height")
|
|
||||||
val height: Int, // 3507
|
|
||||||
@SerialName("lines")
|
|
||||||
val lines: List<Line>,
|
|
||||||
@SerialName("page")
|
|
||||||
val page: Int, // 1
|
|
||||||
@SerialName("unit")
|
|
||||||
val unit: String, // pixel
|
|
||||||
@SerialName("width")
|
|
||||||
val width: Int // 2481
|
|
||||||
) {
|
|
||||||
@Serializable
|
|
||||||
data class Line(
|
|
||||||
@SerialName("appearance")
|
|
||||||
val appearance: Appearance,
|
|
||||||
@SerialName("boundingBox")
|
|
||||||
val boundingBox: List<Int>,
|
|
||||||
@SerialName("text")
|
|
||||||
val text: String, // Franz Mustermann
|
|
||||||
@SerialName("words")
|
|
||||||
val words: List<Word>
|
|
||||||
) {
|
|
||||||
@Serializable
|
|
||||||
data class Appearance(
|
|
||||||
@SerialName("style")
|
|
||||||
val style: Style
|
|
||||||
) {
|
|
||||||
@Serializable
|
|
||||||
data class Style(
|
|
||||||
@SerialName("confidence")
|
|
||||||
val confidence: Double, // 0.972
|
|
||||||
@SerialName("name")
|
|
||||||
val name: String // other
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
@Serializable
|
|
||||||
data class Word(
|
|
||||||
@SerialName("boundingBox")
|
|
||||||
val boundingBox: List<Int>,
|
|
||||||
@SerialName("confidence")
|
|
||||||
val confidence: Double, // 0.998
|
|
||||||
@SerialName("text")
|
|
||||||
val text: String // Franz
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,14 +0,0 @@
|
||||||
package de.itkl.httpClient
|
|
||||||
|
|
||||||
import io.ktor.client.*
|
|
||||||
import io.ktor.client.engine.cio.*
|
|
||||||
import io.ktor.client.plugins.contentnegotiation.*
|
|
||||||
import io.ktor.serialization.kotlinx.json.*
|
|
||||||
|
|
||||||
fun createHttpClient(): HttpClient {
|
|
||||||
return HttpClient(CIO) {
|
|
||||||
install(ContentNegotiation) {
|
|
||||||
json()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,10 +0,0 @@
|
||||||
package de.itkl.httpClient
|
|
||||||
|
|
||||||
import de.itkl.httpClient.clients.MsOcr
|
|
||||||
import io.ktor.client.*
|
|
||||||
import org.koin.dsl.module
|
|
||||||
|
|
||||||
val httpClientModule = module {
|
|
||||||
single<HttpClient> { createHttpClient() }
|
|
||||||
single<MsOcr> { MsOcr() }
|
|
||||||
}
|
|
||||||
|
|
@ -1,36 +0,0 @@
|
||||||
package de.itkl.httpClient.clients
|
|
||||||
|
|
||||||
import de.itkl.core_api.coreApiModule
|
|
||||||
import de.itkl.core_api.implementation.FileResource
|
|
||||||
import de.itkl.core_api.interfaces.Resource
|
|
||||||
import de.itkl.httpClient.httpClientModule
|
|
||||||
import kotlinx.coroutines.runBlocking
|
|
||||||
import org.junit.Rule
|
|
||||||
import org.junit.jupiter.api.BeforeEach
|
|
||||||
import org.junit.jupiter.api.Test
|
|
||||||
import org.koin.core.component.inject
|
|
||||||
import org.koin.core.context.startKoin
|
|
||||||
import org.koin.test.KoinTest
|
|
||||||
import java.nio.file.Paths
|
|
||||||
|
|
||||||
class MsOcrTest : KoinTest {
|
|
||||||
|
|
||||||
@BeforeEach
|
|
||||||
fun start() {
|
|
||||||
startKoin {
|
|
||||||
printLogger()
|
|
||||||
modules(
|
|
||||||
coreApiModule,
|
|
||||||
httpClientModule)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
fun `can create a request`() = runBlocking {
|
|
||||||
val msOcrClient: MsOcr by inject()
|
|
||||||
val resource = FileResource(Paths.get("../../assets/xs-reg/00001.jpg").toAbsolutePath())
|
|
||||||
val response = msOcrClient.ocr(resource)
|
|
||||||
println(response)
|
|
||||||
Unit
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,5 +1,4 @@
|
||||||
dependencies {
|
dependencies {
|
||||||
api(project(":libraries:core-api"))
|
|
||||||
api("org.apache.lucene:lucene-analysis-common:9.9.0")
|
api("org.apache.lucene:lucene-analysis-common:9.9.0")
|
||||||
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
||||||
implementation("com.google.guava:guava:32.1.3-jre")
|
implementation("com.google.guava:guava:32.1.3-jre")
|
||||||
|
|
|
||||||
|
|
@ -2,16 +2,30 @@ package de.itkl.textprocessing
|
||||||
|
|
||||||
import kotlinx.coroutines.flow.*
|
import kotlinx.coroutines.flow.*
|
||||||
|
|
||||||
class Histogram(
|
class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : Iterable<Pair<String, UInt>>{
|
||||||
private val histo: MutableMap<String,UInt> = mutableMapOf()
|
|
||||||
) : Iterable<Pair<String, UInt>>{
|
|
||||||
companion object {
|
companion object {
|
||||||
|
suspend fun from(flow: Flow<String>): Histogram {
|
||||||
|
return Histogram().apply {
|
||||||
|
flow.collect(this::add)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fun fromBagOfWords(bagOfWords: BagOfWords): Histogram {
|
fun fromBagOfWords(bagOfWords: BagOfWords): Histogram {
|
||||||
val result = Histogram()
|
val result = Histogram()
|
||||||
bagOfWords.forEach(result::add)
|
bagOfWords.forEach(result::add)
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram {
|
||||||
|
val result = Histogram()
|
||||||
|
flow.collect() { value ->
|
||||||
|
value.forEach(result::add)
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
fun from(sequence: Sequence<Map<String, String>>): Histogram {
|
fun from(sequence: Sequence<Map<String, String>>): Histogram {
|
||||||
val histo = sequence.associate { map -> map["word"]!! to map["count"]!!.toUInt() }
|
val histo = sequence.associate { map -> map["word"]!! to map["count"]!!.toUInt() }
|
||||||
.toMutableMap()
|
.toMutableMap()
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@ package de.itkl.textprocessing
|
||||||
|
|
||||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
|
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
|
||||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
||||||
import de.itkl.core_api.interfaces.Resource
|
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
|
|
||||||
|
|
@ -17,9 +16,9 @@ class HistogramCsvStorage {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
suspend fun read(resource: Resource): Histogram {
|
suspend fun read(file: File): Histogram {
|
||||||
return csvReader { }
|
return csvReader { }
|
||||||
.openAsync(resource.read()) {
|
.openAsync(file) {
|
||||||
val sequence = readAllWithHeaderAsSequence()
|
val sequence = readAllWithHeaderAsSequence()
|
||||||
Histogram.from(sequence)
|
Histogram.from(sequence)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
dependencies {
|
dependencies {
|
||||||
api(project(":libraries:textprocessing"))
|
api(project(":libraries:textprocessing"))
|
||||||
api(project(":libraries:fileprocessing"))
|
api(project(":libraries:fileprocessing"))
|
||||||
api(project(":libraries:core-api"))
|
|
||||||
implementation("com.github.ajalt.mordant:mordant:2.2.0")
|
implementation("com.github.ajalt.mordant:mordant:2.2.0")
|
||||||
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
||||||
implementation("com.google.guava:guava:32.1.3-jre")
|
implementation("com.google.guava:guava:32.1.3-jre")
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
package de.itkl.tfidf
|
package de.itkl.tfidf
|
||||||
|
|
||||||
import de.itkl.core_api.interfaces.FileProcessor
|
import de.itkl.fileprocessing.FileProcessor
|
||||||
import de.itkl.core_api.interfaces.Resource
|
import de.itkl.fileprocessing.Resource
|
||||||
import de.itkl.processing.parallelUnordered
|
import de.itkl.processing.parallelUnordered
|
||||||
import de.itkl.textprocessing.*
|
import de.itkl.textprocessing.*
|
||||||
import de.itkl.textprocessing.interfaces.Stemmer
|
import de.itkl.textprocessing.interfaces.Stemmer
|
||||||
|
|
@ -24,8 +24,8 @@ class DocumentFrequency : FileProcessor, KoinComponent {
|
||||||
}
|
}
|
||||||
|
|
||||||
override suspend fun process(resource: Resource): File = coroutineScope {
|
override suspend fun process(resource: Resource): File = coroutineScope {
|
||||||
Log.info { "Would produce: ${willProduce(resource.path!!)}" }
|
Log.info { "Would produce: ${willProduce(resource.path)}" }
|
||||||
val resultFile = willProduce(resource.path!!).toFile()
|
val resultFile = willProduce(resource.path).toFile()
|
||||||
val (numDocs, histogram) = TextFile(resource.read())
|
val (numDocs, histogram) = TextFile(resource.read())
|
||||||
.splitByEmptyLines()
|
.splitByEmptyLines()
|
||||||
.withIndex()
|
.withIndex()
|
||||||
|
|
|
||||||
|
|
@ -1,39 +1,43 @@
|
||||||
package de.itkl.tfidf
|
package de.itkl.tfidf
|
||||||
|
|
||||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
||||||
import de.itkl.core_api.interfaces.FileProcessor
|
import de.itkl.fileprocessing.FileProcessor
|
||||||
import de.itkl.core_api.interfaces.Resource
|
import de.itkl.fileprocessing.ProgressBarFactory
|
||||||
import de.itkl.core_api.interfaces.ProgressBarFactory
|
import de.itkl.fileprocessing.Resource
|
||||||
import de.itkl.textprocessing.HistogramCsvStorage
|
import de.itkl.textprocessing.HistogramCsvStorage
|
||||||
|
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||||
import org.koin.core.component.KoinComponent
|
import org.koin.core.component.KoinComponent
|
||||||
import org.koin.core.component.inject
|
import org.koin.core.component.inject
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
import kotlin.io.path.nameWithoutExtension
|
import kotlin.io.path.nameWithoutExtension
|
||||||
|
import kotlin.math.ln
|
||||||
|
import kotlin.math.log
|
||||||
import kotlin.math.log10
|
import kotlin.math.log10
|
||||||
|
import kotlin.math.log2
|
||||||
|
|
||||||
|
private val Log = KotlinLogging.logger { }
|
||||||
|
|
||||||
class InverseDocumentFrequency : FileProcessor, KoinComponent {
|
class InverseDocumentFrequency : FileProcessor, KoinComponent {
|
||||||
override fun willProduce(path: Path): Path {
|
override fun willProduce(path: Path): Path {
|
||||||
return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
|
return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
override suspend fun process(resource: Resource): File {
|
override suspend fun process(resource: Resource): File {
|
||||||
val histogram = HistogramCsvStorage().read(resource)
|
val histogram = HistogramCsvStorage().read(resource.toFile())
|
||||||
val numDocs = histogram
|
val numDocs = histogram
|
||||||
.find { (word, _) -> word == "\$numDocs" }!!
|
.find { (word, count) -> word == "\$numDocs" }!!
|
||||||
.second.toInt()
|
.second.toInt()
|
||||||
val progressBarFactory: ProgressBarFactory by inject()
|
val progressBarFactory: ProgressBarFactory by inject()
|
||||||
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
|
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progess ->
|
||||||
csvWriter().openAsync(willProduce(resource.path!!).toFile(), append = false) {
|
csvWriter().openAsync(willProduce(resource.path).toFile(), append = false) {
|
||||||
writeRow("word", "idf")
|
writeRow("word", "idf")
|
||||||
histogram.forEach { (word, count) ->
|
histogram.forEach { (word, count) ->
|
||||||
writeRow(word, idf(numDocs, count))
|
writeRow(word, idf(numDocs, count))
|
||||||
progress.step()
|
progess.step()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
resource.path!!.toFile()
|
resource.path.toFile()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,11 @@
|
||||||
package de.itkl.tui.implementation
|
package de.itkl.tfidf
|
||||||
|
|
||||||
import com.github.ajalt.mordant.animation.ProgressAnimation
|
import com.github.ajalt.mordant.animation.ProgressAnimation
|
||||||
import com.github.ajalt.mordant.animation.progressAnimation
|
import com.github.ajalt.mordant.animation.progressAnimation
|
||||||
import com.github.ajalt.mordant.terminal.Terminal
|
import com.github.ajalt.mordant.terminal.Terminal
|
||||||
import de.itkl.core_api.interfaces.Resource
|
import de.itkl.fileprocessing.ProgressBar
|
||||||
import de.itkl.core_api.interfaces.ProgressBar
|
import de.itkl.fileprocessing.ProgressBarFactory
|
||||||
import de.itkl.core_api.interfaces.ProgressBarFactory
|
import de.itkl.fileprocessing.Resource
|
||||||
|
|
||||||
class TerminalProgressBarFactory : ProgressBarFactory {
|
class TerminalProgressBarFactory : ProgressBarFactory {
|
||||||
private val terminal = Terminal()
|
private val terminal = Terminal()
|
||||||
|
|
@ -17,7 +17,7 @@ class TerminalProgressBarFactory : ProgressBarFactory {
|
||||||
completed()
|
completed()
|
||||||
timeRemaining()
|
timeRemaining()
|
||||||
}
|
}
|
||||||
return TerminalProgressBar(animation, resource.length!!)
|
return TerminalProgressBar(animation, resource.length())
|
||||||
}
|
}
|
||||||
|
|
||||||
override fun new(name: String, max: Long): ProgressBar {
|
override fun new(name: String, max: Long): ProgressBar {
|
||||||
|
|
@ -1,7 +1,9 @@
|
||||||
package de.itkl.tfidf
|
package de.itkl.tfidf
|
||||||
|
|
||||||
import de.itkl.core_api.interfaces.FileProcessor
|
|
||||||
import de.itkl.fileprocessing.FileProcessingPipeline
|
import de.itkl.fileprocessing.FileProcessingPipeline
|
||||||
|
import de.itkl.fileprocessing.FileProcessor
|
||||||
|
import de.itkl.fileprocessing.ProgressBarFactory
|
||||||
|
import org.koin.core.component.KoinComponent
|
||||||
|
|
||||||
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
|
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
|
||||||
override val fileProcessor = listOf<FileProcessor>(
|
override val fileProcessor = listOf<FileProcessor>(
|
||||||
|
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
dependencies {
|
|
||||||
api(project(":libraries:core-api"))
|
|
||||||
implementation("com.github.ajalt.mordant:mordant:2.2.0")
|
|
||||||
}
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
package de.itkl.tui.implementation
|
|
||||||
|
|
||||||
class TerminalDataTableReporter {
|
|
||||||
}
|
|
||||||
|
|
@ -1,14 +0,0 @@
|
||||||
package de.itkl.tui
|
|
||||||
|
|
||||||
import de.itkl.core_api.interfaces.ProgressBarFactory
|
|
||||||
import de.itkl.tui.implementation.TerminalProgressBarFactory
|
|
||||||
import org.koin.dsl.module
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Add terminal ui capabilities
|
|
||||||
*/
|
|
||||||
val tuiModule = module {
|
|
||||||
single<ProgressBarFactory> {
|
|
||||||
TerminalProgressBarFactory()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Loading…
Reference in New Issue