Refactor code and add functionality for term frequency calculation

This commit refactors the code and adds functionality to calculate term frequency (TF). TF is now computed as a separate step from the TF-IDF calculation, which improves the modularity and maintainability of the code. In addition, an obsolete test file (MessageUtilsTest.kt) has been removed and several dependencies have been updated or removed. A few further changes improve the readability and usability of the code.
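For context, the refactored flow splits the work into two calls on TfIdf: computeTf builds (or reloads) the term histogram for a corpus, and normalizeTf writes the normalized term frequencies to CSV. A minimal usage sketch of that sequence, mirroring the new ComputeTf command below, with a hypothetical corpus path and assuming Language exposes a GERMAN constant (the enum's actual values are not shown in this diff):

import de.itkl.tfidf.Language
import de.itkl.tfidf.TfIdf
import kotlinx.coroutines.runBlocking
import java.io.File

fun main() = runBlocking {
    // Hypothetical corpus location; any plain-text corpus file works here.
    val corpus = File("data/corpus.txt")
    val tfIdf = TfIdf()

    // Step 1: build (or reload from an existing "<name>-terms.csv") the raw term histogram.
    val histogram = tfIdf.computeTf(corpus, Language.GERMAN) // GERMAN is an assumption, not confirmed by the diff

    // Step 2: normalize the counts and write "<name>-tf.csv" next to the corpus file.
    val output = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile()
    tfIdf.normalizeTf(histogram, output)
}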
develop
Timo Bryant 2023-12-15 21:14:36 +01:00
parent 351ab9b9e3
commit ca51b50306
8 changed files with 72 additions and 39 deletions

View File

@@ -1,7 +1,5 @@
package docthor.app
import java.nio.file.Paths
import com.github.ajalt.clikt.core.CliktCommand
import com.github.ajalt.clikt.parameters.options.option
@@ -11,8 +9,9 @@ import com.github.ajalt.clikt.parameters.types.file
import de.itkl.tfidf.Language
import de.itkl.tfidf.TfIdf
import kotlinx.coroutines.runBlocking
import java.io.File
class MainCommand : CliktCommand() {
class ComputeTf : CliktCommand() {
private val corpus by option(help = "corpus")
.file()
.required()
@@ -21,10 +20,14 @@ class MainCommand : CliktCommand() {
.required()
override fun run() = runBlocking {
TfIdf().buildTfIdfDict(
corpus, language
val tfIdf = TfIdf()
val histogram = tfIdf.computeTf(
corpus,
language
)
tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile())
}
}
fun main(args: Array<String>) = MainCommand().main(args)
fun main(args: Array<String>) = ComputeTf().main(args)

View File

@@ -1,14 +0,0 @@
/*
* This Kotlin source file was generated by the Gradle "init" task.
*/
package docthor.app
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.Assertions.assertEquals
class MessageUtilsTest {
@Test fun testGetMessage() {
assertEquals("Hello World!", MessageUtils.getMessage())
}
}

View File

@@ -5,16 +5,11 @@ plugins {
}
repositories {
// Use Maven Central for resolving dependencies.
mavenCentral()
}
dependencies {
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
constraints {
// Define dependency versions as constraints
implementation("org.apache.commons:commons-text:1.10.0")
}
}
testing {

View File

@@ -2,8 +2,7 @@ package de.itkl.textprocessing
import kotlinx.coroutines.flow.Flow
class Histogram : Iterable<Pair<String, UInt>>{
private val histo: MutableMap<String,UInt> = mutableMapOf()
class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : Iterable<Pair<String, UInt>>{
companion object {
suspend fun from(flow: Flow<String>): Histogram {
@@ -11,6 +10,12 @@ class Histogram : Iterable<Pair<String, UInt>>{
flow.collect(this::add)
}
}
fun from(sequence: Sequence<Map<String, String>>): Histogram {
val histo = sequence.associate { map -> map["word"]!! to map["count"]!!.toUInt() }
.toMutableMap()
return Histogram(histo)
}
}
fun add(word: String) {
@@ -20,7 +25,6 @@ class Histogram : Iterable<Pair<String, UInt>>{
}
val size get() = histo.size
override fun iterator(): Iterator<Pair<String, UInt>> {
return iterator {
histo.forEach { (t, u) -> yield(t to u) }

View File

@@ -1,11 +1,11 @@
package de.itkl.textprocessing
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import java.io.File
import java.nio.file.Path
class HistogramCsvStorage {
suspend fun save(histogram: Histogram, file: File, progressOp: (Long) -> Unit = {}) {
csvWriter {}
.openAsync(file, append = false) {
@@ -16,7 +16,11 @@ class HistogramCsvStorage {
}
}
}
fun read(path: Path): Histogram {
TODO()
suspend fun read(file: File): Histogram {
return csvReader { }
.openAsync(file) {
val sequence = readAllWithHeaderAsSequence()
Histogram.from(sequence)
}
}
}

View File

@@ -3,6 +3,7 @@ plugins {
}
dependencies {
implementation(project(":libraries:textprocessing"))
api(project(":libraries:textprocessing"))
implementation("com.github.ajalt.mordant:mordant:2.2.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
}

View File

@@ -0,0 +1,31 @@
package de.itkl.tfidf
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.textprocessing.Histogram
import io.github.oshai.kotlinlogging.KotlinLogging
import java.io.File
import kotlin.math.max
private val Log = KotlinLogging.logger { }
class Tf {
private val data: MutableMap<String, Double> = mutableMapOf()
fun update(histogram: Histogram): Tf {
val max = histogram.maxOf { (_, count) -> count }
.toDouble()
histogram.forEach { (word, count) ->
val tf = count.toDouble() / max
data[word] = tf
}
return this
}
suspend fun saveToCsv(file: File) {
csvWriter {}
.openAsync(file, append = false) {
writeRow("term", "frequency")
data.forEach { (t, u) ->
writeRow(t, u)
}
}
}
}

View File

@@ -1,27 +1,30 @@
package de.itkl.tfidf
import com.github.ajalt.mordant.animation.progressAnimation
import com.github.ajalt.mordant.terminal.Terminal
import de.itkl.textprocessing.Histogram
import de.itkl.textprocessing.HistogramCsvStorage
import de.itkl.textprocessing.TextFile
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.flow.map
import kotlinx.coroutines.flow.take
import kotlinx.coroutines.withTimeoutOrNull
import org.tartarus.snowball.SnowballStemmer
import org.tartarus.snowball.ext.GermanStemmer
import java.awt.SystemColor.text
import java.io.File
import kotlin.io.path.exists
private val Log = KotlinLogging.logger { }
class TfIdf {
suspend fun buildTfIdfDict(
suspend fun computeTf(
corpus: File,
language: Language
) {
): Histogram {
Log.info { "Processing $corpus" }
val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.csv")
if(destination.exists()) {
return HistogramCsvStorage().read(destination.toFile())
}
val filesize = corpus.length()
val t = Terminal()
@@ -31,12 +34,18 @@ class TfIdf {
Histogram.from(words)
}
val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.csv")
t.progressBar("Saving ${histogram.size} entries", histogram.size.toLong()) {
HistogramCsvStorage()
.save(histogram,destination.toFile()) { entriesWritten -> update(entriesWritten)}
}
return histogram
}
suspend fun normalizeTf(histogram: Histogram, destination: File) {
Log.info { "Write tf to $destination" }
Tf()
.update(histogram)
.saveToCsv(destination)
}
private fun stemmer(language: Language): SnowballStemmer {