Compare commits

..

No commits in common. "9f3813a83abe26d9a0da9f3402ae0a233b100597" and "2deaa204c5ef3d10db58047ccf7c1c2121b29fdd" have entirely different histories.

38 changed files with 123 additions and 398 deletions

View File

@ -1,36 +0,0 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="docthor [clean]" type="GradleRunConfiguration" factoryName="Gradle" nameIsGenerated="true">
<ExternalSystemSettings>
<option name="executionName" />
<option name="externalProjectPath" value="$PROJECT_DIR$" />
<option name="externalSystemIdString" value="GRADLE" />
<option name="scriptParameters" value="" />
<option name="taskDescriptions">
<list />
</option>
<option name="taskNames">
<list>
<option value="clean" />
</list>
</option>
<option name="vmOptions" />
</ExternalSystemSettings>
<ExternalSystemDebugServerProcess>true</ExternalSystemDebugServerProcess>
<ExternalSystemReattachDebugProcess>true</ExternalSystemReattachDebugProcess>
<EXTENSION ID="com.intellij.execution.ExternalSystemRunConfigurationJavaExtension">
<extension name="net.ashald.envfile">
<option name="IS_ENABLED" value="false" />
<option name="IS_SUBST" value="false" />
<option name="IS_PATH_MACRO_SUPPORTED" value="false" />
<option name="IS_IGNORE_MISSING_FILES" value="false" />
<option name="IS_ENABLE_EXPERIMENTAL_INTEGRATIONS" value="false" />
<ENTRIES>
<ENTRY IS_ENABLED="true" PARSER="runconfig" IS_EXECUTABLE="false" />
</ENTRIES>
</extension>
</EXTENSION>
<DebugAllEnabled>false</DebugAllEnabled>
<RunAsTest>false</RunAsTest>
<method v="2" />
</configuration>
</component>

View File

@ -4,7 +4,7 @@
<instance-profile id="d"
name="Docthor"
start-page="docthor.md">
start-page="starter-topic.md">
<toc-element topic="docthor.md"/>
<toc-element topic="starter-topic.md"/>
</instance-profile>

View File

@ -21,12 +21,4 @@ All libraries should be placed under <path>libraries</path>
<def title="io">
Abstraction for reading from and writing to resources (filesystem, HTTP, S3, etc.)
</def>
<def title="core-api">
Defines the core interfaces
</def>
<def title="tui">
Provides TUI capabilities. When applied as Koin modules,
resources will automatically print a read/write progress bar
on the terminal.
</def>
</deflist>

View File

@ -4,7 +4,6 @@ plugins {
dependencies {
implementation(project(":libraries:tfidf"))
implementation(project(":libraries:tui"))
}
application {

View File

@ -6,13 +6,15 @@ import com.github.ajalt.clikt.parameters.options.option
import com.github.ajalt.clikt.parameters.options.required
import com.github.ajalt.clikt.parameters.types.enum
import com.github.ajalt.clikt.parameters.types.file
import de.itkl.core_api.coreApiModule
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.textprocessing.textProcessingModule
import de.itkl.tfidf.Language
import de.itkl.tfidf.TerminalProgressBarFactory
//import de.itkl.tfidf.TfIdf
import de.itkl.tfidf.TfIdfPipeline
import de.itkl.tui.tuiModule
import kotlinx.coroutines.runBlocking
import org.koin.core.context.startKoin
import org.koin.dsl.module
class ComputeIdf : CliktCommand() {
private val corpus by option(help = "corpus")
@ -31,9 +33,12 @@ class ComputeIdf : CliktCommand() {
fun main(args: Array<String>) {
startKoin {
modules(
coreApiModule,
textProcessingModule,
tuiModule)
module {
single<ProgressBarFactory> {
TerminalProgressBarFactory()
}
})
ComputeIdf().main(args)
}
}

View File

@ -7,5 +7,5 @@ repositories {
}
dependencies {
implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:$embeddedKotlinVersion")
implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:1.8.20")
}

View File

@ -13,11 +13,6 @@ dependencies {
val koin_version = "3.5.3"
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
implementation("io.insert-koin:koin-core:$koin_version")
implementation("org.jetbrains.kotlinx:kotlinx-datetime:0.5.0")
implementation("org.jetbrains.kotlinx:kotlinx-serialization-json:1.6.2")
testImplementation("io.insert-koin:koin-test:$koin_version")
}
java {

View File

View File

@ -0,0 +1,6 @@
package de.itkl.clients
/**
 * Placeholder OCR client.
 *
 * No request logic is implemented here; [ocr] is a no-op.
 * NOTE(review): the name suggests the Microsoft OCR service — confirm.
 */
class MsOcr {
    /** Does nothing and returns immediately. */
    suspend fun ocr(): Unit = Unit
}

View File

@ -1,11 +0,0 @@
package de.itkl.core_api
import de.itkl.core_api.interfaces.NoopResourceReadDecorator
import de.itkl.core_api.interfaces.ResourceFactory
import de.itkl.core_api.interfaces.ResourceReadDecorator
import org.koin.dsl.module
/**
 * Koin module of the core API.
 *
 * Registers a [NoopResourceReadDecorator] as the [ResourceReadDecorator] singleton
 * and a [ResourceFactory] singleton.
 */
val coreApiModule = module {
    single<ResourceReadDecorator> {
        NoopResourceReadDecorator()
    }
    single {
        ResourceFactory()
    }
}

View File

@ -1,24 +0,0 @@
package de.itkl.core_api.implementation
import de.itkl.core_api.interfaces.AbstractResource
import io.ktor.http.*
import java.io.File
import java.io.InputStream
import java.nio.file.Files
import java.nio.file.Path
import kotlin.io.path.name
/**
 * An [AbstractResource] backed by a file on the local filesystem.
 *
 * @property path location of the backing file
 */
class FileResource(override val path: Path) : AbstractResource() {
    /** Convenience constructor that accepts a [File] instead of a [Path]. */
    constructor(file: File) : this(file.toPath())

    /** Length of the file as reported by [File.length]; queried on first access and cached. */
    override val length: Long by lazy { path.toFile().length() }

    /** The backing file as a [File]. */
    override val file: File?
        get() = path.toFile()

    /** Last segment of [path]. */
    override val filename: String
        get() = path.name

    /**
     * Content type derived from the file name.
     *
     * `ContentType.fromFilePath` yields an empty list when the extension is unknown
     * or missing, so fall back to `application/octet-stream` instead of throwing
     * `NoSuchElementException`.
     */
    override val contentType: ContentType
        get() = ContentType.fromFilePath(path.name).firstOrNull()
            ?: ContentType.Application.OctetStream

    /** Opens a fresh input stream on the file; the caller is responsible for closing it. */
    override fun doRead(): InputStream = Files.newInputStream(path)
}

View File

@ -1,19 +0,0 @@
package de.itkl.core_api.implementation
import de.itkl.core_api.implementation.ProgressInputStream
import de.itkl.core_api.interfaces.ProgressBarFactory
import de.itkl.core_api.interfaces.Resource
import java.io.InputStream
/**
 * Decorates a [Resource] so that reading it reports progress.
 *
 * Every [Resource] member is delegated to the wrapped [resource] except [read],
 * which wraps the underlying stream in a [ProgressInputStream].
 */
internal class ProgressResource(
    private val resource: Resource,
    private val progressBarFactory: ProgressBarFactory,
) : Resource by resource {

    /** Opens the wrapped resource and couples the stream with a new progress bar for this resource. */
    override fun read(): InputStream {
        val source = resource.read()
        val bar = progressBarFactory.new(this)
        return ProgressInputStream(source, bar)
    }
}

View File

@ -1,6 +1,7 @@
package de.itkl.core_api.interfaces
import java.io.File
import java.io.InputStream
import java.nio.file.Path
interface FileProcessor {

View File

@ -3,32 +3,20 @@ package de.itkl.core_api.interfaces
import io.ktor.http.*
import org.koin.core.component.KoinComponent
import org.koin.core.component.get
import java.io.File
import java.io.InputStream
import java.nio.file.Path
interface Resource {
val filename: String
val contentType: ContentType
// TODO: Find a better method to avoid those nulls. Maybe subtyping the interface
val length: Long?
val file: File?
val path: Path?
fun read(): InputStream
}
abstract class Resource : KoinComponent {
abstract val filename: String
abstract val contentType: ContentType
abstract val length: Long?
/**
* Automatically adds koin injectable decorators to reading/writing
* operations
*/
abstract class AbstractResource : Resource, KoinComponent {
abstract fun doRead(): InputStream
final override fun read(): InputStream {
protected abstract fun doRead(): InputStream
fun read(): InputStream {
return length?.let { length ->
get<ResourceReadDecorator>().decorate(
length = length,
doRead()
read()
)
} ?: doRead()
} ?: read()
}
}

View File

@ -1,16 +0,0 @@
package de.itkl.core_api.interfaces
import de.itkl.core_api.implementation.FileResource
import de.itkl.core_api.implementation.ProgressResource
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
/**
 * Creates [Resource] instances whose reads are tracked by a progress bar.
 *
 * The [ProgressBarFactory] is resolved lazily from Koin.
 */
class ResourceFactory : KoinComponent {
    private val progressBarFactory: ProgressBarFactory by inject()

    /** Wraps [file] in a [FileResource] decorated with a [ProgressResource]. */
    fun file(file: File): Resource =
        ProgressResource(FileResource(file), progressBarFactory)
}

View File

@ -1,4 +0,0 @@
package de.itkl.core_api.interfaces.data
/**
 * A table of string cells; iterating yields one `List<String>` per element.
 * NOTE(review): each element is presumably one row, aligned with [columns] — confirm.
 */
interface DataTable : Iterable<List<String>> {
/** The table's columns (presumably the column names — confirm against implementations). */
val columns: List<String>
}

View File

@ -1,7 +1,6 @@
package de.itkl.fileprocessing
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.ResourceFactory
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
@ -11,9 +10,10 @@ import kotlin.io.path.exists
private val Log = KotlinLogging.logger { }
abstract class FileProcessingPipeline(private val force: Boolean = false) : KoinComponent {
private val resourceFactory: ResourceFactory by inject()
protected abstract val fileProcessor: List<FileProcessor>
private val progressBarFactory: ProgressBarFactory by inject()
suspend fun input(file: File) {
var currentFile = file
fileProcessor.forEach { processor ->
@ -22,8 +22,9 @@ abstract class FileProcessingPipeline(private val force: Boolean = false) : Koin
Log.info { "$target exists. Skipping" }
} else {
Log.info { "$target does not exists. Creating" }
val resource = resourceFactory.file(currentFile)
processor.process(resource)
val resource = FileResource(currentFile)
val progress = ProgressResource(resource, progressBarFactory)
processor.process(progress)
Log.info { "File created: $target" }
}
currentFile = target.toFile()

View File

@ -1,4 +1,4 @@
package de.itkl.core_api.interfaces
package de.itkl.fileprocessing
interface ProgressBarFactory {
fun new(resource: Resource): ProgressBar

View File

@ -1,6 +1,5 @@
package de.itkl.core_api.implementation
package de.itkl.fileprocessing
import de.itkl.core_api.interfaces.ProgressBar
import java.io.InputStream
/**
@ -10,10 +9,9 @@ import java.io.InputStream
* @property updateOp The operation to be executed when the number of bytes read changes.
* @property bytesRead The number of bytes read from the input stream.
*/
internal class ProgressInputStream(
class ProgressInputStream(
private val inputStream: InputStream,
private val progressBar: ProgressBar
) : InputStream() {
private val progressBar: ProgressBar) : InputStream() {
@Volatile
var bytesRead: Long = 0
private set(value) {

View File

@ -0,0 +1,42 @@
package de.itkl.fileprocessing
import java.io.File
import java.io.InputStream
import java.nio.file.Files
import java.nio.file.Path
import kotlin.io.path.name
/**
 * A readable, path-addressed resource.
 */
interface Resource {
    /** Location of the resource. */
    val path: Path

    /** Size of the resource as reported by the implementation. */
    val size: Long

    /** Name of the resource as reported by the implementation. */
    val filename: String

    /** Opens an input stream on the resource's content. */
    fun read(): InputStream

    /** Converts [path] to a [File]. */
    fun toFile(): File {
        return path.toFile()
    }

    /** Length of the file at [path], queried on every call. */
    fun length(): Long {
        return path.toFile().length()
    }
}
/**
 * Decorates a [Resource] so that reading it reports progress.
 *
 * Every [Resource] member is delegated to the wrapped [resource] except [read],
 * which wraps the underlying stream in a [ProgressInputStream].
 */
class ProgressResource(
    private val resource: Resource,
    private val progressBarFactory: ProgressBarFactory,
) : Resource by resource {

    /** Opens the wrapped resource and couples the stream with a new progress bar for this resource. */
    override fun read(): InputStream {
        val source = resource.read()
        val bar = progressBarFactory.new(this)
        return ProgressInputStream(source, bar)
    }
}
/**
 * A [Resource] backed by a file on the local filesystem.
 *
 * @property path location of the backing file
 */
class FileResource(override val path: Path) : Resource {
    /** Convenience constructor that accepts a [File] instead of a [Path]. */
    constructor(file: File) : this(file.toPath())

    /** Last segment of [path]. */
    override val filename: String
        get() = path.name

    /** Length of the file as reported by [File.length]; queried on first access and cached. */
    override val size: Long by lazy { path.toFile().length() }

    /** Opens a fresh input stream on the file. */
    override fun read(): InputStream = Files.newInputStream(path)
}

View File

@ -1,15 +0,0 @@
// Build script for the HTTP client library.
plugins {
// Serialization compiler plugin, pinned to the Kotlin version embedded in Gradle.
kotlin("plugin.serialization") version embeddedKotlinVersion
}
// Ktor version is read from a project property named `ktorVersion`.
val ktorVersion: String by project
dependencies {
// `api` dependencies are exposed to consumers of this library.
api(project(":libraries:core-api"))
api("io.ktor:ktor-client-core:$ktorVersion")
api("io.ktor:ktor-client-core-jvm:$ktorVersion")
// `implementation` dependencies stay internal to this library.
implementation("io.ktor:ktor-client-cio:$ktorVersion")
implementation("io.ktor:ktor-client-content-negotiation:$ktorVersion")
implementation("io.ktor:ktor-serialization-kotlinx-json:$ktorVersion")
}

View File

@ -1 +0,0 @@
ktorVersion=2.3.7

View File

@ -1,30 +0,0 @@
package de.itkl.httpClient.clients
import de.itkl.core_api.interfaces.Resource
import io.github.oshai.kotlinlogging.KotlinLogging
import io.ktor.client.*
import io.ktor.client.call.*
import io.ktor.client.request.*
import io.ktor.client.statement.*
import io.ktor.http.*
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
private val Log = KotlinLogging.logger { }
/**
 * HTTP client for the OCR "read" endpoint.
 *
 * The [HttpClient] is resolved lazily from Koin.
 *
 * @param endpoint URL of the synchronous analyze endpoint; defaults to the address
 *   that was previously hard-coded, so `MsOcr()` behaves as before
 */
class MsOcr(
    private val endpoint: String = "http://10.54.150.152:5000/vision/v3.2/read/syncAnalyze",
) : KoinComponent {
    private val httpClient: HttpClient by inject()

    /**
     * Posts the content of [resource] to [endpoint] and decodes the reply.
     *
     * @param resource document to analyze; its content type is sent as the request's `Content-Type`
     * @return the response body deserialized as [MsOcrResponse]
     */
    suspend fun ocr(resource: Resource): MsOcrResponse {
        val response = httpClient.post {
            url(endpoint)
            // `parameters { }` from io.ktor.http only builds a detached Parameters object
            // whose result is discarded; `parameter(...)` actually adds to the query string.
            parameter("language", "de")
            parameter("readingOrder", "natural")
            contentType(resource.contentType)
            setBody(resource.read())
        }
        Log.info { "got response: ${response.status} in ${response.responseTime}" }
        return response.body()
    }
}

View File

@ -1,81 +0,0 @@
package de.itkl.httpClient.clients
import kotlinx.datetime.Instant
import kotlinx.datetime.LocalDateTime
import kotlinx.serialization.SerialName
import kotlinx.serialization.Serializable
/**
 * Serializable model of the OCR response JSON.
 *
 * Each property is bound to the JSON key given in its `@SerialName`.
 * Trailing comments show sample values.
 * NOTE(review): numeric fields such as `angle`, `width`, `height` and `boundingBox`
 * are typed as Int, apparently from a sample — confirm the service never returns
 * fractional values, otherwise decoding will fail.
 */
@Serializable
data class MsOcrResponse(
@SerialName("analyzeResult")
val analyzeResult: AnalyzeResult,
@SerialName("createdDateTime")
val createdDateTime: Instant, // 2023-12-29T21:02:30Z
@SerialName("lastUpdatedDateTime")
val lastUpdatedDateTime: Instant, // 2023-12-29T21:02:31Z
@SerialName("status")
val status: String // succeeded
) {
/** Payload of the analysis: model/version information plus one [ReadResult] per list entry. */
@Serializable
data class AnalyzeResult(
@SerialName("modelVersion")
val modelVersion: String, // 2022-04-30
@SerialName("readResults")
val readResults: List<ReadResult>,
@SerialName("version")
val version: String // 3.2.0
) {
/** Result entry carrying a page number, dimensions with their unit, and the recognized [lines]. */
@Serializable
data class ReadResult(
@SerialName("angle")
val angle: Int, // 0
@SerialName("height")
val height: Int, // 3507
@SerialName("lines")
val lines: List<Line>,
@SerialName("page")
val page: Int, // 1
@SerialName("unit")
val unit: String, // pixel
@SerialName("width")
val width: Int // 2481
) {
/** A recognized line: its text, bounding box, appearance and the individual [words]. */
@Serializable
data class Line(
@SerialName("appearance")
val appearance: Appearance,
@SerialName("boundingBox")
val boundingBox: List<Int>,
@SerialName("text")
val text: String, // Franz Mustermann
@SerialName("words")
val words: List<Word>
) {
/** Wrapper around the [Style] reported for a line. */
@Serializable
data class Appearance(
@SerialName("style")
val style: Style
) {
/** Style name together with its confidence value. */
@Serializable
data class Style(
@SerialName("confidence")
val confidence: Double, // 0.972
@SerialName("name")
val name: String // other
)
}
/** A single recognized word with its bounding box and confidence value. */
@Serializable
data class Word(
@SerialName("boundingBox")
val boundingBox: List<Int>,
@SerialName("confidence")
val confidence: Double, // 0.998
@SerialName("text")
val text: String // Franz
)
}
}
}
}

View File

@ -1,14 +0,0 @@
package de.itkl.httpClient
import io.ktor.client.*
import io.ktor.client.engine.cio.*
import io.ktor.client.plugins.contentnegotiation.*
import io.ktor.serialization.kotlinx.json.*
/**
 * Builds an [HttpClient] on the CIO engine with JSON content negotiation installed.
 */
fun createHttpClient(): HttpClient = HttpClient(CIO) {
    install(ContentNegotiation) { json() }
}

View File

@ -1,10 +0,0 @@
package de.itkl.httpClient
import de.itkl.httpClient.clients.MsOcr
import io.ktor.client.*
import org.koin.dsl.module
/**
 * Koin module of the HTTP client library.
 *
 * Registers singletons for [MsOcr] and for the [HttpClient] built by [createHttpClient].
 */
val httpClientModule = module {
    single {
        MsOcr()
    }
    single {
        createHttpClient()
    }
}

View File

@ -1,36 +0,0 @@
package de.itkl.httpClient.clients
import de.itkl.core_api.coreApiModule
import de.itkl.core_api.implementation.FileResource
import de.itkl.core_api.interfaces.Resource
import de.itkl.httpClient.httpClientModule
import kotlinx.coroutines.runBlocking
import org.junit.Rule
import org.junit.jupiter.api.BeforeEach
import org.junit.jupiter.api.Test
import org.koin.core.component.inject
import org.koin.core.context.startKoin
import org.koin.test.KoinTest
import java.nio.file.Paths
/**
 * Test for [MsOcr] that sends a real request.
 *
 * NOTE(review): this calls the endpoint configured in [MsOcr] and reads a file from
 * `../../assets`, so it needs network access and the asset to be present.
 */
class MsOcrTest : KoinTest {
    /** Starts a fresh Koin context with the modules the client depends on. */
    @BeforeEach
    fun start() {
        startKoin {
            printLogger()
            modules(
                coreApiModule,
                httpClientModule,
            )
        }
    }

    /**
     * Stops the global Koin context so it does not leak into other tests;
     * without this a second `startKoin` call would fail because Koin is already started.
     */
    @org.junit.jupiter.api.AfterEach
    fun stop() {
        org.koin.core.context.stopKoin()
    }

    @Test
    fun `can create a request`() = runBlocking<Unit> {
        val msOcrClient: MsOcr by inject()
        val resource = FileResource(Paths.get("../../assets/xs-reg/00001.jpg").toAbsolutePath())
        val response = msOcrClient.ocr(resource)
        println(response)
    }
}

View File

@ -1,5 +1,4 @@
dependencies {
api(project(":libraries:core-api"))
api("org.apache.lucene:lucene-analysis-common:9.9.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
implementation("com.google.guava:guava:32.1.3-jre")

View File

@ -2,16 +2,30 @@ package de.itkl.textprocessing
import kotlinx.coroutines.flow.*
class Histogram(
private val histo: MutableMap<String,UInt> = mutableMapOf()
) : Iterable<Pair<String, UInt>>{
class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : Iterable<Pair<String, UInt>>{
companion object {
suspend fun from(flow: Flow<String>): Histogram {
return Histogram().apply {
flow.collect(this::add)
}
}
fun fromBagOfWords(bagOfWords: BagOfWords): Histogram {
val result = Histogram()
bagOfWords.forEach(result::add)
return result
}
suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram {
val result = Histogram()
flow.collect() { value ->
value.forEach(result::add)
}
return result
}
fun from(sequence: Sequence<Map<String, String>>): Histogram {
val histo = sequence.associate { map -> map["word"]!! to map["count"]!!.toUInt() }
.toMutableMap()

View File

@ -2,7 +2,6 @@ package de.itkl.textprocessing
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.core_api.interfaces.Resource
import java.io.File
import java.nio.file.Path
@ -17,9 +16,9 @@ class HistogramCsvStorage {
}
}
}
suspend fun read(resource: Resource): Histogram {
suspend fun read(file: File): Histogram {
return csvReader { }
.openAsync(resource.read()) {
.openAsync(file) {
val sequence = readAllWithHeaderAsSequence()
Histogram.from(sequence)
}

View File

@ -1,7 +1,6 @@
dependencies {
api(project(":libraries:textprocessing"))
api(project(":libraries:fileprocessing"))
api(project(":libraries:core-api"))
implementation("com.github.ajalt.mordant:mordant:2.2.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
implementation("com.google.guava:guava:32.1.3-jre")

View File

@ -1,7 +1,7 @@
package de.itkl.tfidf
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.Resource
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.Resource
import de.itkl.processing.parallelUnordered
import de.itkl.textprocessing.*
import de.itkl.textprocessing.interfaces.Stemmer
@ -24,8 +24,8 @@ class DocumentFrequency : FileProcessor, KoinComponent {
}
override suspend fun process(resource: Resource): File = coroutineScope {
Log.info { "Would produce: ${willProduce(resource.path!!)}" }
val resultFile = willProduce(resource.path!!).toFile()
Log.info { "Would produce: ${willProduce(resource.path)}" }
val resultFile = willProduce(resource.path).toFile()
val (numDocs, histogram) = TextFile(resource.read())
.splitByEmptyLines()
.withIndex()

View File

@ -1,39 +1,43 @@
package de.itkl.tfidf
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.Resource
import de.itkl.core_api.interfaces.ProgressBarFactory
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.fileprocessing.Resource
import de.itkl.textprocessing.HistogramCsvStorage
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
import kotlin.math.ln
import kotlin.math.log
import kotlin.math.log10
import kotlin.math.log2
private val Log = KotlinLogging.logger { }
class InverseDocumentFrequency : FileProcessor, KoinComponent {
override fun willProduce(path: Path): Path {
return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
}
override suspend fun process(resource: Resource): File {
val histogram = HistogramCsvStorage().read(resource)
val histogram = HistogramCsvStorage().read(resource.toFile())
val numDocs = histogram
.find { (word, _) -> word == "\$numDocs" }!!
.find { (word, count) -> word == "\$numDocs" }!!
.second.toInt()
val progressBarFactory: ProgressBarFactory by inject()
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
csvWriter().openAsync(willProduce(resource.path!!).toFile(), append = false) {
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progess ->
csvWriter().openAsync(willProduce(resource.path).toFile(), append = false) {
writeRow("word", "idf")
histogram.forEach { (word, count) ->
writeRow(word, idf(numDocs, count))
progress.step()
progess.step()
}
}
resource.path!!.toFile()
resource.path.toFile()
}
}

View File

@ -1,11 +1,11 @@
package de.itkl.tui.implementation
package de.itkl.tfidf
import com.github.ajalt.mordant.animation.ProgressAnimation
import com.github.ajalt.mordant.animation.progressAnimation
import com.github.ajalt.mordant.terminal.Terminal
import de.itkl.core_api.interfaces.Resource
import de.itkl.core_api.interfaces.ProgressBar
import de.itkl.core_api.interfaces.ProgressBarFactory
import de.itkl.fileprocessing.ProgressBar
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.fileprocessing.Resource
class TerminalProgressBarFactory : ProgressBarFactory {
private val terminal = Terminal()
@ -17,7 +17,7 @@ class TerminalProgressBarFactory : ProgressBarFactory {
completed()
timeRemaining()
}
return TerminalProgressBar(animation, resource.length!!)
return TerminalProgressBar(animation, resource.length())
}
override fun new(name: String, max: Long): ProgressBar {

View File

@ -1,7 +1,9 @@
package de.itkl.tfidf
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.fileprocessing.FileProcessingPipeline
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory
import org.koin.core.component.KoinComponent
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
override val fileProcessor = listOf<FileProcessor>(

View File

@ -1,4 +0,0 @@
// Build script for the tui library.
dependencies {
// Exposed to consumers of this library.
api(project(":libraries:core-api"))
// Internal to this library.
implementation("com.github.ajalt.mordant:mordant:2.2.0")
}

View File

@ -1,4 +0,0 @@
package de.itkl.tui.implementation
/**
 * Empty placeholder class; it declares no members yet.
 */
class TerminalDataTableReporter {
}

View File

@ -1,14 +0,0 @@
package de.itkl.tui
import de.itkl.core_api.interfaces.ProgressBarFactory
import de.itkl.tui.implementation.TerminalProgressBarFactory
import org.koin.dsl.module
/**
 * Koin module that adds terminal UI capabilities.
 *
 * Registers [TerminalProgressBarFactory] as the [ProgressBarFactory] singleton.
 */
val tuiModule = module {
    single<ProgressBarFactory> { TerminalProgressBarFactory() }
}