Compare commits

...

6 Commits

Author SHA1 Message Date
Timo Bryant 9f3813a83a starting with ms ocr client 2023-12-29 22:20:33 +01:00
Timo Bryant 30dc3b658d cleanup 2023-12-29 20:45:30 +01:00
Timo Bryant 6fb0ce2a4f move stuff to core-io/tui 2023-12-29 20:42:06 +01:00
Timo Bryant d62aadb95f move ProgressBarFactory to core api 2023-12-27 16:28:51 +01:00
Timo Bryant f777669dfa 7 Add TUI module 2023-12-27 16:16:34 +01:00
Timo Bryant cc727c681a adding core api 2023-12-27 16:11:12 +01:00
38 changed files with 398 additions and 123 deletions

View File

@ -0,0 +1,36 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="docthor [clean]" type="GradleRunConfiguration" factoryName="Gradle" nameIsGenerated="true">
<ExternalSystemSettings>
<option name="executionName" />
<option name="externalProjectPath" value="$PROJECT_DIR$" />
<option name="externalSystemIdString" value="GRADLE" />
<option name="scriptParameters" value="" />
<option name="taskDescriptions">
<list />
</option>
<option name="taskNames">
<list>
<option value="clean" />
</list>
</option>
<option name="vmOptions" />
</ExternalSystemSettings>
<ExternalSystemDebugServerProcess>true</ExternalSystemDebugServerProcess>
<ExternalSystemReattachDebugProcess>true</ExternalSystemReattachDebugProcess>
<EXTENSION ID="com.intellij.execution.ExternalSystemRunConfigurationJavaExtension">
<extension name="net.ashald.envfile">
<option name="IS_ENABLED" value="false" />
<option name="IS_SUBST" value="false" />
<option name="IS_PATH_MACRO_SUPPORTED" value="false" />
<option name="IS_IGNORE_MISSING_FILES" value="false" />
<option name="IS_ENABLE_EXPERIMENTAL_INTEGRATIONS" value="false" />
<ENTRIES>
<ENTRY IS_ENABLED="true" PARSER="runconfig" IS_EXECUTABLE="false" />
</ENTRIES>
</extension>
</EXTENSION>
<DebugAllEnabled>false</DebugAllEnabled>
<RunAsTest>false</RunAsTest>
<method v="2" />
</configuration>
</component>

View File

@ -4,7 +4,7 @@
<instance-profile id="d"
name="Docthor"
start-page="starter-topic.md">
start-page="docthor.md">
<toc-element topic="starter-topic.md"/>
<toc-element topic="docthor.md"/>
</instance-profile>

View File

@ -21,4 +21,12 @@ All libraries should be placed under <path>libraries</path>
<def title="io">
Abstraction for reading from and writing to resources (filesystem, HTTP, S3, etc.)
</def>
<def title="core-api">
Defines the core interfaces
</def>
<def title="tui">
Provides TUI (terminal UI) capabilities. When its Koin module is loaded,
resources automatically print a read/write progress bar
on the terminal.
</def>
</deflist>

View File

@ -4,6 +4,7 @@ plugins {
dependencies {
implementation(project(":libraries:tfidf"))
implementation(project(":libraries:tui"))
}
application {

View File

@ -6,15 +6,13 @@ import com.github.ajalt.clikt.parameters.options.option
import com.github.ajalt.clikt.parameters.options.required
import com.github.ajalt.clikt.parameters.types.enum
import com.github.ajalt.clikt.parameters.types.file
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.core_api.coreApiModule
import de.itkl.textprocessing.textProcessingModule
import de.itkl.tfidf.Language
import de.itkl.tfidf.TerminalProgressBarFactory
//import de.itkl.tfidf.TfIdf
import de.itkl.tfidf.TfIdfPipeline
import de.itkl.tui.tuiModule
import kotlinx.coroutines.runBlocking
import org.koin.core.context.startKoin
import org.koin.dsl.module
class ComputeIdf : CliktCommand() {
private val corpus by option(help = "corpus")
@ -33,12 +31,9 @@ class ComputeIdf : CliktCommand() {
fun main(args: Array<String>) {
startKoin {
modules(
coreApiModule,
textProcessingModule,
module {
single<ProgressBarFactory> {
TerminalProgressBarFactory()
}
})
tuiModule)
ComputeIdf().main(args)
}
}

View File

@ -7,5 +7,5 @@ repositories {
}
dependencies {
implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:1.8.20")
implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:$embeddedKotlinVersion")
}

View File

@ -13,6 +13,11 @@ dependencies {
val koin_version = "3.5.3"
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
implementation("io.insert-koin:koin-core:$koin_version")
implementation("org.jetbrains.kotlinx:kotlinx-datetime:0.5.0")
implementation("org.jetbrains.kotlinx:kotlinx-serialization-json:1.6.2")
testImplementation("io.insert-koin:koin-test:$koin_version")
}
java {

View File

@ -1,6 +0,0 @@
package de.itkl.clients
/**
 * Placeholder OCR client; [ocr] currently performs no work.
 */
class MsOcr {
    /** Suspends nowhere and returns immediately. */
    suspend fun ocr() = Unit
}

View File

@ -0,0 +1,11 @@
package de.itkl.core_api
import de.itkl.core_api.interfaces.NoopResourceReadDecorator
import de.itkl.core_api.interfaces.ResourceFactory
import de.itkl.core_api.interfaces.ResourceReadDecorator
import org.koin.dsl.module
/**
 * Koin module of the core API.
 *
 * Registers a singleton [ResourceFactory] and binds [ResourceReadDecorator]
 * to a singleton [NoopResourceReadDecorator].
 */
val coreApiModule = module {
    single { ResourceFactory() }
    single<ResourceReadDecorator> { NoopResourceReadDecorator() }
}

View File

@ -0,0 +1,24 @@
package de.itkl.core_api.implementation
import de.itkl.core_api.interfaces.AbstractResource
import io.ktor.http.*
import java.io.File
import java.io.InputStream
import java.nio.file.Files
import java.nio.file.Path
import kotlin.io.path.name
/**
 * An [AbstractResource] backed by a file on the local filesystem.
 *
 * @property path location of the file.
 */
class FileResource(override val path: Path) : AbstractResource() {
    /** Convenience constructor for callers holding a [File]. */
    constructor(file: File) : this(file.toPath())

    /** File size in bytes, queried once on first access and cached afterwards. */
    override val length: Long by lazy { path.toFile().length() }

    override val file: File?
        get() = path.toFile()

    override val filename: String
        get() = path.name

    /**
     * Content type derived from the file name.
     *
     * `ContentType.fromFilePath` returns an empty list for names without a
     * (known) extension, so fall back to `application/octet-stream` instead of
     * throwing [NoSuchElementException].
     */
    override val contentType: ContentType
        get() = ContentType.fromFilePath(path.name).firstOrNull()
            ?: ContentType.Application.OctetStream

    /** Opens a fresh input stream on [path]; the caller is responsible for closing it. */
    override fun doRead(): InputStream = Files.newInputStream(path)
}

View File

@ -1,5 +1,6 @@
package de.itkl.fileprocessing
package de.itkl.core_api.implementation
import de.itkl.core_api.interfaces.ProgressBar
import java.io.InputStream
/**
@ -9,9 +10,10 @@ import java.io.InputStream
* @property updateOp The operation to be executed when the number of bytes read changes.
* @property bytesRead The number of bytes read from the input stream.
*/
class ProgressInputStream(
internal class ProgressInputStream(
private val inputStream: InputStream,
private val progressBar: ProgressBar) : InputStream() {
private val progressBar: ProgressBar
) : InputStream() {
@Volatile
var bytesRead: Long = 0
private set(value) {

View File

@ -0,0 +1,19 @@
package de.itkl.core_api.implementation
import de.itkl.core_api.implementation.ProgressInputStream
import de.itkl.core_api.interfaces.ProgressBarFactory
import de.itkl.core_api.interfaces.Resource
import java.io.InputStream
/**
 * [Resource] wrapper that forwards every member to [resource] except [read],
 * whose stream is wrapped in a [ProgressInputStream] together with a progress
 * bar obtained from [progressBarFactory].
 */
internal class ProgressResource(
    private val resource: Resource,
    private val progressBarFactory: ProgressBarFactory,
) : Resource by resource {
    override fun read(): InputStream {
        // Open the underlying stream first, then ask the factory for a bar for this resource.
        val source = resource.read()
        val progressBar = progressBarFactory.new(this)
        return ProgressInputStream(source, progressBar)
    }
}

View File

@ -1,7 +1,6 @@
package de.itkl.core_api.interfaces
import java.io.File
import java.io.InputStream
import java.nio.file.Path
interface FileProcessor {

View File

@ -1,4 +1,4 @@
package de.itkl.fileprocessing
package de.itkl.core_api.interfaces
interface ProgressBarFactory {
fun new(resource: Resource): ProgressBar

View File

@ -3,20 +3,32 @@ package de.itkl.core_api.interfaces
import io.ktor.http.*
import org.koin.core.component.KoinComponent
import org.koin.core.component.get
import java.io.File
import java.io.InputStream
import java.nio.file.Path
abstract class Resource : KoinComponent {
abstract val filename: String
abstract val contentType: ContentType
abstract val length: Long?
interface Resource {
val filename: String
val contentType: ContentType
// TODO: Find a better method to avoid those nulls. Maybe subtyping the interface
val length: Long?
val file: File?
val path: Path?
fun read(): InputStream
}
protected abstract fun doRead(): InputStream
fun read(): InputStream {
/**
* Automatically adds koin injectable decorators to reading/writing
* operations
*/
abstract class AbstractResource : Resource, KoinComponent {
abstract fun doRead(): InputStream
final override fun read(): InputStream {
return length?.let { length ->
get<ResourceReadDecorator>().decorate(
length = length,
read()
doRead()
)
} ?: read()
} ?: doRead()
}
}

View File

@ -0,0 +1,16 @@
package de.itkl.core_api.interfaces
import de.itkl.core_api.implementation.FileResource
import de.itkl.core_api.implementation.ProgressResource
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
/**
 * Creates [Resource] instances. The [ProgressBarFactory] is looked up lazily
 * through Koin the first time it is needed.
 */
class ResourceFactory : KoinComponent {
    private val progressBarFactory: ProgressBarFactory by inject()

    /** Wraps [file] in a [FileResource] and decorates it with a [ProgressResource]. */
    fun file(file: File): Resource =
        ProgressResource(FileResource(file), progressBarFactory)
}

View File

@ -0,0 +1,4 @@
package de.itkl.core_api.interfaces.data
/**
 * Tabular data exposed as an [Iterable] of `List<String>` elements plus a list
 * of column names.
 *
 * NOTE(review): presumably each iterated list is one row whose entries line up
 * with [columns] — confirm once an implementation exists.
 */
interface DataTable : Iterable<List<String>> {
/** Names of the table's columns. */
val columns: List<String>
}

View File

@ -1,6 +1,7 @@
package de.itkl.fileprocessing
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.ResourceFactory
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
@ -10,10 +11,9 @@ import kotlin.io.path.exists
private val Log = KotlinLogging.logger { }
abstract class FileProcessingPipeline(private val force: Boolean = false) : KoinComponent {
private val resourceFactory: ResourceFactory by inject()
protected abstract val fileProcessor: List<FileProcessor>
private val progressBarFactory: ProgressBarFactory by inject()
suspend fun input(file: File) {
var currentFile = file
fileProcessor.forEach { processor ->
@ -22,9 +22,8 @@ abstract class FileProcessingPipeline(private val force: Boolean = false) : Koin
Log.info { "$target exists. Skipping" }
} else {
Log.info { "$target does not exists. Creating" }
val resource = FileResource(currentFile)
val progress = ProgressResource(resource, progressBarFactory)
processor.process(progress)
val resource = resourceFactory.file(currentFile)
processor.process(resource)
Log.info { "File created: $target" }
}
currentFile = target.toFile()

View File

@ -1,42 +0,0 @@
package de.itkl.fileprocessing
import java.io.File
import java.io.InputStream
import java.nio.file.Files
import java.nio.file.Path
import kotlin.io.path.name
/**
 * Something readable that lives at a filesystem [path].
 */
interface Resource {
    val path: Path
    val size: Long
    val filename: String

    /** Opens an input stream on the resource. */
    fun read(): InputStream

    /** [path] converted to a [File]. */
    fun toFile(): File {
        return path.toFile()
    }

    /** Length in bytes of the file at [path], queried on every call. */
    fun length(): Long {
        return path.toFile().length()
    }
}
/**
 * [Resource] wrapper that forwards every member to [resource] except [read],
 * whose stream is wrapped in a [ProgressInputStream] together with a progress
 * bar obtained from [progressBarFactory].
 */
class ProgressResource(
    private val resource: Resource,
    private val progressBarFactory: ProgressBarFactory,
) : Resource by resource {
    override fun read(): InputStream {
        // Open the underlying stream first, then ask the factory for a bar for this resource.
        val source = resource.read()
        val progressBar = progressBarFactory.new(this)
        return ProgressInputStream(source, progressBar)
    }
}
/**
 * [Resource] backed by the file at [path].
 */
class FileResource(override val path: Path) : Resource {
    constructor(file: File) : this(file.toPath())

    override val filename: String get() = path.name

    /** File size in bytes, queried once on first access. */
    override val size: Long by lazy { path.toFile().length() }

    override fun read(): InputStream = Files.newInputStream(path)
}

View File

@ -0,0 +1,15 @@
// Applies the Kotlin serialization plugin and declares the Ktor client dependencies.
plugins {
// Serialization compiler plugin, pinned to the Kotlin version embedded in Gradle.
kotlin("plugin.serialization") version embeddedKotlinVersion
}
// Read from the Gradle project property `ktorVersion`.
val ktorVersion: String by project
dependencies {
// `api` dependencies are exposed to consumers of this module; `implementation` ones are not.
api(project(":libraries:core-api"))
api("io.ktor:ktor-client-core:$ktorVersion")
api("io.ktor:ktor-client-core-jvm:$ktorVersion")
implementation("io.ktor:ktor-client-cio:$ktorVersion")
implementation("io.ktor:ktor-client-content-negotiation:$ktorVersion")
implementation("io.ktor:ktor-serialization-kotlinx-json:$ktorVersion")
}

View File

@ -0,0 +1 @@
ktorVersion=2.3.7

View File

@ -0,0 +1,30 @@
package de.itkl.httpClient.clients
import de.itkl.core_api.interfaces.Resource
import io.github.oshai.kotlinlogging.KotlinLogging
import io.ktor.client.*
import io.ktor.client.call.*
import io.ktor.client.request.*
import io.ktor.client.statement.*
import io.ktor.http.*
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
private val Log = KotlinLogging.logger { }
/**
 * Client for the OCR `read/syncAnalyze` HTTP endpoint.
 *
 * The [HttpClient] is resolved lazily through Koin.
 *
 * @property endpoint URL the document is posted to; defaults to the address
 * that was previously hard-coded in [ocr].
 */
class MsOcr(
    private val endpoint: String = "http://10.54.150.152:5000/vision/v3.2/read/syncAnalyze",
) : KoinComponent {
    private val httpClient: HttpClient by inject()

    /**
     * Posts the content of [resource] to [endpoint] and decodes the response
     * body as [MsOcrResponse].
     *
     * @param resource supplies the request body and its content type.
     * @return the decoded response body.
     */
    suspend fun ocr(resource: Resource): MsOcrResponse {
        val response = httpClient.post {
            url(endpoint)
            // `parameters { }` from io.ktor.http only builds a detached Parameters
            // object whose result was discarded; `parameter` appends to the request URL.
            parameter("language", "de")
            parameter("readingOrder", "natural")
            contentType(resource.contentType)
            setBody(resource.read())
        }
        Log.info { "got response: ${response.status} in ${response.responseTime}" }
        return response.body()
    }
}

View File

@ -0,0 +1,81 @@
package de.itkl.httpClient.clients
import kotlinx.datetime.Instant
import kotlinx.datetime.LocalDateTime
import kotlinx.serialization.SerialName
import kotlinx.serialization.Serializable
@Serializable
/**
 * Serializable model of the response body that `MsOcr.ocr` decodes.
 *
 * Every property carries an explicit [SerialName] equal to its Kotlin name.
 * The trailing `//` comments appear to be sample values from a real response.
 *
 * NOTE(review): all properties are required (no defaults, no nullable types),
 * so decoding fails if the service omits any of them — confirm that e.g.
 * `appearance` is always present.
 */
data class MsOcrResponse(
@SerialName("analyzeResult")
val analyzeResult: AnalyzeResult,
@SerialName("createdDateTime")
val createdDateTime: Instant, // 2023-12-29T21:02:30Z
@SerialName("lastUpdatedDateTime")
val lastUpdatedDateTime: Instant, // 2023-12-29T21:02:31Z
@SerialName("status")
val status: String // succeeded
) {
/** Payload of the analysis: model/version identifiers and the list of [ReadResult]s. */
@Serializable
data class AnalyzeResult(
@SerialName("modelVersion")
val modelVersion: String, // 2022-04-30
@SerialName("readResults")
val readResults: List<ReadResult>,
@SerialName("version")
val version: String // 3.2.0
) {
/**
 * One entry of `readResults`, holding page geometry and the recognised [Line]s.
 *
 * NOTE(review): `angle`, `width`, `height` and the `boundingBox` entries are
 * declared as Int; a fractional value in the JSON would fail to decode —
 * confirm the service only ever returns integers here.
 */
@Serializable
data class ReadResult(
@SerialName("angle")
val angle: Int, // 0
@SerialName("height")
val height: Int, // 3507
@SerialName("lines")
val lines: List<Line>,
@SerialName("page")
val page: Int, // 1
@SerialName("unit")
val unit: String, // pixel
@SerialName("width")
val width: Int // 2481
) {
/** A recognised line: its text, bounding box, [Appearance] and individual [Word]s. */
@Serializable
data class Line(
@SerialName("appearance")
val appearance: Appearance,
@SerialName("boundingBox")
val boundingBox: List<Int>,
@SerialName("text")
val text: String, // Franz Mustermann
@SerialName("words")
val words: List<Word>
) {
/** Wrapper around the [Style] reported for a line. */
@Serializable
data class Appearance(
@SerialName("style")
val style: Style
) {
/** Style name together with the confidence reported for it. */
@Serializable
data class Style(
@SerialName("confidence")
val confidence: Double, // 0.972
@SerialName("name")
val name: String // other
)
}
/** A single recognised word with its bounding box and confidence. */
@Serializable
data class Word(
@SerialName("boundingBox")
val boundingBox: List<Int>,
@SerialName("confidence")
val confidence: Double, // 0.998
@SerialName("text")
val text: String // Franz
)
}
}
}
}

View File

@ -0,0 +1,14 @@
package de.itkl.httpClient
import io.ktor.client.*
import io.ktor.client.engine.cio.*
import io.ktor.client.plugins.contentnegotiation.*
import io.ktor.serialization.kotlinx.json.*
/**
 * Builds an [HttpClient] on the CIO engine with the [ContentNegotiation]
 * plugin installed and JSON registered using its default configuration.
 */
fun createHttpClient(): HttpClient = HttpClient(CIO) {
    install(ContentNegotiation) { json() }
}

View File

@ -0,0 +1,10 @@
package de.itkl.httpClient
import de.itkl.httpClient.clients.MsOcr
import io.ktor.client.*
import org.koin.dsl.module
/**
 * Koin module providing a singleton [HttpClient] (built by [createHttpClient])
 * and a singleton [MsOcr] client.
 */
val httpClientModule = module {
    single { createHttpClient() }
    single { MsOcr() }
}

View File

@ -0,0 +1,36 @@
package de.itkl.httpClient.clients
import de.itkl.core_api.coreApiModule
import de.itkl.core_api.implementation.FileResource
import de.itkl.core_api.interfaces.Resource
import de.itkl.httpClient.httpClientModule
import kotlinx.coroutines.runBlocking
import org.junit.Rule
import org.junit.jupiter.api.BeforeEach
import org.junit.jupiter.api.Test
import org.koin.core.component.inject
import org.koin.core.context.startKoin
import org.koin.test.KoinTest
import java.nio.file.Paths
/**
 * Exercises [MsOcr] against the configured endpoint using a sample image from
 * the repository's assets directory.
 */
class MsOcrTest : KoinTest {
    @BeforeEach
    fun start() {
        startKoin {
            printLogger()
            modules(coreApiModule, httpClientModule)
        }
    }

    // Without stopping Koin, a second test's startKoin would fail because the
    // global context is already started. Fully-qualified names keep this block
    // independent of the file's import list.
    @org.junit.jupiter.api.AfterEach
    fun stop() {
        org.koin.core.context.stopKoin()
    }

    @Test
    fun `can create a request`(): Unit = runBlocking {
        val msOcrClient: MsOcr by inject()
        val resource = FileResource(Paths.get("../../assets/xs-reg/00001.jpg").toAbsolutePath())
        val response = msOcrClient.ocr(resource)
        println(response)
    }
}

View File

@ -1,4 +1,5 @@
dependencies {
api(project(":libraries:core-api"))
api("org.apache.lucene:lucene-analysis-common:9.9.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
implementation("com.google.guava:guava:32.1.3-jre")

View File

@ -2,30 +2,16 @@ package de.itkl.textprocessing
import kotlinx.coroutines.flow.*
class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : Iterable<Pair<String, UInt>>{
class Histogram(
private val histo: MutableMap<String,UInt> = mutableMapOf()
) : Iterable<Pair<String, UInt>>{
companion object {
suspend fun from(flow: Flow<String>): Histogram {
return Histogram().apply {
flow.collect(this::add)
}
}
fun fromBagOfWords(bagOfWords: BagOfWords): Histogram {
val result = Histogram()
bagOfWords.forEach(result::add)
return result
}
suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram {
val result = Histogram()
flow.collect() { value ->
value.forEach(result::add)
}
return result
}
fun from(sequence: Sequence<Map<String, String>>): Histogram {
val histo = sequence.associate { map -> map["word"]!! to map["count"]!!.toUInt() }
.toMutableMap()

View File

@ -2,6 +2,7 @@ package de.itkl.textprocessing
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.core_api.interfaces.Resource
import java.io.File
import java.nio.file.Path
@ -16,9 +17,9 @@ class HistogramCsvStorage {
}
}
}
suspend fun read(file: File): Histogram {
suspend fun read(resource: Resource): Histogram {
return csvReader { }
.openAsync(file) {
.openAsync(resource.read()) {
val sequence = readAllWithHeaderAsSequence()
Histogram.from(sequence)
}

View File

@ -1,6 +1,7 @@
dependencies {
api(project(":libraries:textprocessing"))
api(project(":libraries:fileprocessing"))
api(project(":libraries:core-api"))
implementation("com.github.ajalt.mordant:mordant:2.2.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
implementation("com.google.guava:guava:32.1.3-jre")

View File

@ -1,7 +1,7 @@
package de.itkl.tfidf
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.Resource
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.Resource
import de.itkl.processing.parallelUnordered
import de.itkl.textprocessing.*
import de.itkl.textprocessing.interfaces.Stemmer
@ -24,8 +24,8 @@ class DocumentFrequency : FileProcessor, KoinComponent {
}
override suspend fun process(resource: Resource): File = coroutineScope {
Log.info { "Would produce: ${willProduce(resource.path)}" }
val resultFile = willProduce(resource.path).toFile()
Log.info { "Would produce: ${willProduce(resource.path!!)}" }
val resultFile = willProduce(resource.path!!).toFile()
val (numDocs, histogram) = TextFile(resource.read())
.splitByEmptyLines()
.withIndex()

View File

@ -1,43 +1,39 @@
package de.itkl.tfidf
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.fileprocessing.Resource
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.Resource
import de.itkl.core_api.interfaces.ProgressBarFactory
import de.itkl.textprocessing.HistogramCsvStorage
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
import kotlin.math.ln
import kotlin.math.log
import kotlin.math.log10
import kotlin.math.log2
private val Log = KotlinLogging.logger { }
class InverseDocumentFrequency : FileProcessor, KoinComponent {
override fun willProduce(path: Path): Path {
return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
}
override suspend fun process(resource: Resource): File {
val histogram = HistogramCsvStorage().read(resource.toFile())
val histogram = HistogramCsvStorage().read(resource)
val numDocs = histogram
.find { (word, count) -> word == "\$numDocs" }!!
.find { (word, _) -> word == "\$numDocs" }!!
.second.toInt()
val progressBarFactory: ProgressBarFactory by inject()
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progess ->
csvWriter().openAsync(willProduce(resource.path).toFile(), append = false) {
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
csvWriter().openAsync(willProduce(resource.path!!).toFile(), append = false) {
writeRow("word", "idf")
histogram.forEach { (word, count) ->
writeRow(word, idf(numDocs, count))
progess.step()
progress.step()
}
}
resource.path.toFile()
resource.path!!.toFile()
}
}

View File

@ -1,9 +1,7 @@
package de.itkl.tfidf
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.fileprocessing.FileProcessingPipeline
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory
import org.koin.core.component.KoinComponent
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
override val fileProcessor = listOf<FileProcessor>(

View File

@ -0,0 +1,4 @@
// Dependencies of the tui module.
dependencies {
// Exposed to consumers of this module.
api(project(":libraries:core-api"))
// Mordant is kept as an internal implementation detail.
implementation("com.github.ajalt.mordant:mordant:2.2.0")
}

View File

@ -0,0 +1,4 @@
package de.itkl.tui.implementation
/**
 * Placeholder type; it declares no members yet.
 */
class TerminalDataTableReporter

View File

@ -1,11 +1,11 @@
package de.itkl.tfidf
package de.itkl.tui.implementation
import com.github.ajalt.mordant.animation.ProgressAnimation
import com.github.ajalt.mordant.animation.progressAnimation
import com.github.ajalt.mordant.terminal.Terminal
import de.itkl.fileprocessing.ProgressBar
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.fileprocessing.Resource
import de.itkl.core_api.interfaces.Resource
import de.itkl.core_api.interfaces.ProgressBar
import de.itkl.core_api.interfaces.ProgressBarFactory
class TerminalProgressBarFactory : ProgressBarFactory {
private val terminal = Terminal()
@ -17,7 +17,7 @@ class TerminalProgressBarFactory : ProgressBarFactory {
completed()
timeRemaining()
}
return TerminalProgressBar(animation, resource.length())
return TerminalProgressBar(animation, resource.length!!)
}
override fun new(name: String, max: Long): ProgressBar {

View File

@ -0,0 +1,14 @@
package de.itkl.tui
import de.itkl.core_api.interfaces.ProgressBarFactory
import de.itkl.tui.implementation.TerminalProgressBarFactory
import org.koin.dsl.module
/**
 * Koin module adding terminal UI capabilities: binds [ProgressBarFactory]
 * to a singleton [TerminalProgressBarFactory].
 */
val tuiModule = module {
    single<ProgressBarFactory> { TerminalProgressBarFactory() }
}