diff --git a/app/src/main/kotlin/docthor/app/App.kt b/app/src/main/kotlin/docthor/app/App.kt index d3b140a..8426865 100644 --- a/app/src/main/kotlin/docthor/app/App.kt +++ b/app/src/main/kotlin/docthor/app/App.kt @@ -1,7 +1,5 @@ package docthor.app -import java.nio.file.Paths - import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.parameters.options.option @@ -11,8 +9,9 @@ import com.github.ajalt.clikt.parameters.types.file import de.itkl.tfidf.Language import de.itkl.tfidf.TfIdf import kotlinx.coroutines.runBlocking +import java.io.File -class MainCommand : CliktCommand() { +class ComputeTf : CliktCommand() { private val corpus by option(help = "corpus") .file() .required() @@ -21,10 +20,14 @@ class MainCommand : CliktCommand() { .required() override fun run() = runBlocking { - TfIdf().buildTfIdfDict( - corpus, language + val tfIdf = TfIdf() + val histogram = tfIdf.computeTf( + corpus, + language ) + tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile()) + } } -fun main(args: Array) = MainCommand().main(args) +fun main(args: Array) = ComputeTf().main(args) diff --git a/app/src/test/kotlin/docthor/app/MessageUtilsTest.kt b/app/src/test/kotlin/docthor/app/MessageUtilsTest.kt deleted file mode 100644 index f3c02fe..0000000 --- a/app/src/test/kotlin/docthor/app/MessageUtilsTest.kt +++ /dev/null @@ -1,14 +0,0 @@ -/* - * This Kotlin source file was generated by the Gradle "init" task. - */ -package docthor.app - -import org.junit.jupiter.api.Test - -import org.junit.jupiter.api.Assertions.assertEquals - -class MessageUtilsTest { - @Test fun testGetMessage() { - assertEquals("Hello World!", MessageUtils.getMessage()) - } -} diff --git a/build-logic/src/main/kotlin/docthor.kotlin-common-conventions.gradle.kts b/build-logic/src/main/kotlin/docthor.kotlin-common-conventions.gradle.kts index 9230cc5..1fc4e87 100644 --- a/build-logic/src/main/kotlin/docthor.kotlin-common-conventions.gradle.kts +++ b/build-logic/src/main/kotlin/docthor.kotlin-common-conventions.gradle.kts @@ -5,16 +5,11 @@ plugins { } repositories { - // Use Maven Central for resolving dependencies. mavenCentral() } dependencies { implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3") - constraints { - // Define dependency versions as constraints - implementation("org.apache.commons:commons-text:1.10.0") - } } testing { diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt index d4acf9e..d3721c4 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt @@ -2,8 +2,7 @@ package de.itkl.textprocessing import kotlinx.coroutines.flow.Flow -class Histogram : Iterable>{ - private val histo: MutableMap = mutableMapOf() +class Histogram(private val histo: MutableMap = mutableMapOf()) : Iterable>{ companion object { suspend fun from(flow: Flow): Histogram { @@ -11,6 +10,12 @@ class Histogram : Iterable>{ flow.collect(this::add) } } + + fun from(sequence: Sequence>): Histogram { + val histo = sequence.associate { map -> map["word"]!! to map["count"]!!.toUInt() } + .toMutableMap() + return Histogram(histo) + } } fun add(word: String) { @@ -20,7 +25,6 @@ class Histogram : Iterable>{ } val size get() = histo.size - override fun iterator(): Iterator> { return iterator { histo.forEach { (t, u) -> yield(t to u) } diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt index b08b32c..fca404f 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt @@ -1,11 +1,11 @@ package de.itkl.textprocessing +import com.github.doyaaaaaken.kotlincsv.dsl.csvReader import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter import java.io.File import java.nio.file.Path class HistogramCsvStorage { - suspend fun save(histogram: Histogram, file: File, progressOp: (Long) -> Unit = {}) { csvWriter {} .openAsync(file, append = false) { @@ -16,7 +16,11 @@ class HistogramCsvStorage { } } } - fun read(path: Path): Histogram { - TODO() + suspend fun read(file: File): Histogram { + return csvReader { } + .openAsync(file) { + val sequence = readAllWithHeaderAsSequence() + Histogram.from(sequence) + } } } \ No newline at end of file diff --git a/libraries/tfidf/build.gradle.kts b/libraries/tfidf/build.gradle.kts index 438770d..738ede3 100644 --- a/libraries/tfidf/build.gradle.kts +++ b/libraries/tfidf/build.gradle.kts @@ -3,6 +3,7 @@ plugins { } dependencies { - implementation(project(":libraries:textprocessing")) + api(project(":libraries:textprocessing")) implementation("com.github.ajalt.mordant:mordant:2.2.0") + implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2") } diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt new file mode 100644 index 0000000..ab80429 --- /dev/null +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt @@ -0,0 +1,31 @@ +package de.itkl.tfidf + +import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter +import de.itkl.textprocessing.Histogram +import io.github.oshai.kotlinlogging.KotlinLogging +import java.io.File +import kotlin.math.max + +private val Log = KotlinLogging.logger { } +class Tf { + private val data: MutableMap = mutableMapOf() + fun update(histogram: Histogram): Tf { + val max = histogram.maxOf { (_, count) -> count } + .toDouble() + histogram.forEach { (word, count) -> + val tf = count.toDouble() / max + data[word] = tf + } + return this + } + + suspend fun saveToCsv(file: File) { + csvWriter {} + .openAsync(file, append = false) { + writeRow("term", "frequency") + data.forEach { (t, u) -> + writeRow(t, u) + } + } + } +} \ No newline at end of file diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt index 3c24626..3212c60 100644 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt @@ -1,27 +1,30 @@ package de.itkl.tfidf -import com.github.ajalt.mordant.animation.progressAnimation import com.github.ajalt.mordant.terminal.Terminal import de.itkl.textprocessing.Histogram import de.itkl.textprocessing.HistogramCsvStorage import de.itkl.textprocessing.TextFile import io.github.oshai.kotlinlogging.KotlinLogging import kotlinx.coroutines.flow.map -import kotlinx.coroutines.flow.take -import kotlinx.coroutines.withTimeoutOrNull import org.tartarus.snowball.SnowballStemmer import org.tartarus.snowball.ext.GermanStemmer -import java.awt.SystemColor.text import java.io.File +import kotlin.io.path.exists private val Log = KotlinLogging.logger { } class TfIdf { - suspend fun buildTfIdfDict( + suspend fun computeTf( corpus: File, language: Language - ) { + ): Histogram { Log.info { "Processing $corpus" } + val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.csv") + + if(destination.exists()) { + return HistogramCsvStorage().read(destination.toFile()) + } + val filesize = corpus.length() val t = Terminal() @@ -31,12 +34,18 @@ class TfIdf { Histogram.from(words) } - val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.cv") t.progressBar("Saving ${histogram.size} entries", histogram.size.toLong()) { HistogramCsvStorage() .save(histogram,destination.toFile()) { entriesWritten -> update(entriesWritten)} } + return histogram + } + suspend fun normalizeTf(histogram: Histogram, destination: File) { + Log.info { "Write tf to $destination" } + Tf() + .update(histogram) + .saveToCsv(destination) } private fun stemmer(language: Language): SnowballStemmer {