From ca51b503061579ec6178eb9d82a837d821c564bf Mon Sep 17 00:00:00 2001 From: Timo Bryant Date: Fri, 15 Dec 2023 21:14:36 +0100 Subject: [PATCH] Refactor code and add functionality for term frequency calculation The major changes in this commit are a code refactoring and new functionality to calculate the term frequency (TF). TF is now computed as a separate step from the TF-IDF calculation, which improves the modularity and maintainability of the code. Additionally, an unnecessary test file (MessageUtilsTest.kt) has been removed, and dependencies have been updated or removed as needed: the commons-text constraint is dropped, kotlin-csv is added to the tfidf library, and textprocessing is now exposed as an api dependency of tfidf. A few changes were also made to improve the readability and usability of the code, e.g. unused imports are removed and an existing "-terms.csv" histogram is read back instead of being recomputed. --- app/src/main/kotlin/docthor/app/App.kt | 15 +++++---- .../kotlin/docthor/app/MessageUtilsTest.kt | 14 --------- ...cthor.kotlin-common-conventions.gradle.kts | 5 --- .../de/itkl/textprocessing/Histogram.kt | 10 ++++-- .../textprocessing/HistogramCsvStorage.kt | 10 ++++-- libraries/tfidf/build.gradle.kts | 3 +- .../tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt | 31 +++++++++++++++++++ .../src/main/kotlin/de/itkl/tfidf/TfIdf.kt | 23 +++++++++----- 8 files changed, 72 insertions(+), 39 deletions(-) delete mode 100644 app/src/test/kotlin/docthor/app/MessageUtilsTest.kt create mode 100644 libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt diff --git a/app/src/main/kotlin/docthor/app/App.kt b/app/src/main/kotlin/docthor/app/App.kt index d3b140a..8426865 100644 --- a/app/src/main/kotlin/docthor/app/App.kt +++ b/app/src/main/kotlin/docthor/app/App.kt @@ -1,7 +1,5 @@ package docthor.app -import java.nio.file.Paths - import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.parameters.options.option @@ -11,8 +9,9 @@ import com.github.ajalt.clikt.parameters.types.file import de.itkl.tfidf.Language import de.itkl.tfidf.TfIdf import kotlinx.coroutines.runBlocking +import java.io.File -class MainCommand : CliktCommand() { +class ComputeTf : CliktCommand() { private val corpus by
option(help = "corpus") .file() .required() @@ -21,10 +20,14 @@ class MainCommand : CliktCommand() { .required() override fun run() = runBlocking { - TfIdf().buildTfIdfDict( - corpus, language + val tfIdf = TfIdf() + val histogram = tfIdf.computeTf( + corpus, + language ) + tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile()) + } } -fun main(args: Array) = MainCommand().main(args) +fun main(args: Array) = ComputeTf().main(args) diff --git a/app/src/test/kotlin/docthor/app/MessageUtilsTest.kt b/app/src/test/kotlin/docthor/app/MessageUtilsTest.kt deleted file mode 100644 index f3c02fe..0000000 --- a/app/src/test/kotlin/docthor/app/MessageUtilsTest.kt +++ /dev/null @@ -1,14 +0,0 @@ -/* - * This Kotlin source file was generated by the Gradle "init" task. - */ -package docthor.app - -import org.junit.jupiter.api.Test - -import org.junit.jupiter.api.Assertions.assertEquals - -class MessageUtilsTest { - @Test fun testGetMessage() { - assertEquals("Hello World!", MessageUtils.getMessage()) - } -} diff --git a/build-logic/src/main/kotlin/docthor.kotlin-common-conventions.gradle.kts b/build-logic/src/main/kotlin/docthor.kotlin-common-conventions.gradle.kts index 9230cc5..1fc4e87 100644 --- a/build-logic/src/main/kotlin/docthor.kotlin-common-conventions.gradle.kts +++ b/build-logic/src/main/kotlin/docthor.kotlin-common-conventions.gradle.kts @@ -5,16 +5,11 @@ plugins { } repositories { - // Use Maven Central for resolving dependencies. 
mavenCentral() } dependencies { implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3") - constraints { - // Define dependency versions as constraints - implementation("org.apache.commons:commons-text:1.10.0") - } } testing { diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt index d4acf9e..d3721c4 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt @@ -2,8 +2,7 @@ package de.itkl.textprocessing import kotlinx.coroutines.flow.Flow -class Histogram : Iterable>{ - private val histo: MutableMap = mutableMapOf() +class Histogram(private val histo: MutableMap = mutableMapOf()) : Iterable>{ companion object { suspend fun from(flow: Flow): Histogram { @@ -11,6 +10,12 @@ class Histogram : Iterable>{ flow.collect(this::add) } } + + fun from(sequence: Sequence>): Histogram { + val histo = sequence.associate { map -> map["word"]!! 
to map["count"]!!.toUInt() } + .toMutableMap() + return Histogram(histo) + } } fun add(word: String) { @@ -20,7 +25,6 @@ class Histogram : Iterable>{ } val size get() = histo.size - override fun iterator(): Iterator> { return iterator { histo.forEach { (t, u) -> yield(t to u) } diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt index b08b32c..fca404f 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt @@ -1,11 +1,11 @@ package de.itkl.textprocessing +import com.github.doyaaaaaken.kotlincsv.dsl.csvReader import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter import java.io.File import java.nio.file.Path class HistogramCsvStorage { - suspend fun save(histogram: Histogram, file: File, progressOp: (Long) -> Unit = {}) { csvWriter {} .openAsync(file, append = false) { @@ -16,7 +16,11 @@ class HistogramCsvStorage { } } } - fun read(path: Path): Histogram { - TODO() + suspend fun read(file: File): Histogram { + return csvReader { } + .openAsync(file) { + val sequence = readAllWithHeaderAsSequence() + Histogram.from(sequence) + } } } \ No newline at end of file diff --git a/libraries/tfidf/build.gradle.kts b/libraries/tfidf/build.gradle.kts index 438770d..738ede3 100644 --- a/libraries/tfidf/build.gradle.kts +++ b/libraries/tfidf/build.gradle.kts @@ -3,6 +3,7 @@ plugins { } dependencies { - implementation(project(":libraries:textprocessing")) + api(project(":libraries:textprocessing")) implementation("com.github.ajalt.mordant:mordant:2.2.0") + implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2") } diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt new file mode 100644 index 0000000..ab80429 --- /dev/null +++ 
b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt @@ -0,0 +1,31 @@ +package de.itkl.tfidf + +import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter +import de.itkl.textprocessing.Histogram +import io.github.oshai.kotlinlogging.KotlinLogging +import java.io.File +import kotlin.math.max + +private val Log = KotlinLogging.logger { } +class Tf { + private val data: MutableMap = mutableMapOf() + fun update(histogram: Histogram): Tf { + val max = histogram.maxOf { (_, count) -> count } + .toDouble() + histogram.forEach { (word, count) -> + val tf = count.toDouble() / max + data[word] = tf + } + return this + } + + suspend fun saveToCsv(file: File) { + csvWriter {} + .openAsync(file, append = false) { + writeRow("term", "frequency") + data.forEach { (t, u) -> + writeRow(t, u) + } + } + } +} \ No newline at end of file diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt index 3c24626..3212c60 100644 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt @@ -1,27 +1,30 @@ package de.itkl.tfidf -import com.github.ajalt.mordant.animation.progressAnimation import com.github.ajalt.mordant.terminal.Terminal import de.itkl.textprocessing.Histogram import de.itkl.textprocessing.HistogramCsvStorage import de.itkl.textprocessing.TextFile import io.github.oshai.kotlinlogging.KotlinLogging import kotlinx.coroutines.flow.map -import kotlinx.coroutines.flow.take -import kotlinx.coroutines.withTimeoutOrNull import org.tartarus.snowball.SnowballStemmer import org.tartarus.snowball.ext.GermanStemmer -import java.awt.SystemColor.text import java.io.File +import kotlin.io.path.exists private val Log = KotlinLogging.logger { } class TfIdf { - suspend fun buildTfIdfDict( + suspend fun computeTf( corpus: File, language: Language - ) { + ): Histogram { Log.info { "Processing $corpus" } + val destination = 
corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.csv") + + if(destination.exists()) { + return HistogramCsvStorage().read(destination.toFile()) + } + val filesize = corpus.length() val t = Terminal() @@ -31,12 +34,18 @@ class TfIdf { Histogram.from(words) } - val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.cv") t.progressBar("Saving ${histogram.size} entries", histogram.size.toLong()) { HistogramCsvStorage() .save(histogram,destination.toFile()) { entriesWritten -> update(entriesWritten)} } + return histogram + } + suspend fun normalizeTf(histogram: Histogram, destination: File) { + Log.info { "Write tf to $destination" } + Tf() + .update(histogram) + .saveToCsv(destination) } private fun stemmer(language: Language): SnowballStemmer {