Refactor code and add functionality for term frequency calculation

The major changes in this commit involve code refactoring and adding new functionality to calculate term frequency (TF). TF is now computed as a separate step from the TF-IDF calculation, which improves the modularity and maintainability of the code. Additionally, an unnecessary test file (MessageUtilsTest.kt) has been removed, and various dependencies have been updated or removed as needed. A few changes were also made to improve the readability and usability of the code.
develop
Timo Bryant 2023-12-15 21:14:36 +01:00
parent 351ab9b9e3
commit ca51b50306
8 changed files with 72 additions and 39 deletions

View File

@ -1,7 +1,5 @@
package docthor.app package docthor.app
import java.nio.file.Paths
import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.core.CliktCommand
import com.github.ajalt.clikt.parameters.options.option import com.github.ajalt.clikt.parameters.options.option
@ -11,8 +9,9 @@ import com.github.ajalt.clikt.parameters.types.file
import de.itkl.tfidf.Language import de.itkl.tfidf.Language
import de.itkl.tfidf.TfIdf import de.itkl.tfidf.TfIdf
import kotlinx.coroutines.runBlocking import kotlinx.coroutines.runBlocking
import java.io.File
class MainCommand : CliktCommand() { class ComputeTf : CliktCommand() {
private val corpus by option(help = "corpus") private val corpus by option(help = "corpus")
.file() .file()
.required() .required()
@ -21,10 +20,14 @@ class MainCommand : CliktCommand() {
.required() .required()
override fun run() = runBlocking { override fun run() = runBlocking {
TfIdf().buildTfIdfDict( val tfIdf = TfIdf()
corpus, language val histogram = tfIdf.computeTf(
corpus,
language
) )
tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile())
} }
} }
fun main(args: Array<String>) = MainCommand().main(args) fun main(args: Array<String>) = ComputeTf().main(args)

View File

@ -1,14 +0,0 @@
/*
* This Kotlin source file was generated by the Gradle "init" task.
*/
package docthor.app
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.Assertions.assertEquals
/** Unit test for `MessageUtils`. */
class MessageUtilsTest {
    /** Checks that `MessageUtils.getMessage()` returns the expected greeting text. */
    @Test
    fun testGetMessage() {
        val expected = "Hello World!"
        val actual = MessageUtils.getMessage()
        assertEquals(expected, actual)
    }
}

View File

@ -5,16 +5,11 @@ plugins {
} }
repositories { repositories {
// Use Maven Central for resolving dependencies.
mavenCentral() mavenCentral()
} }
dependencies { dependencies {
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3") implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
constraints {
// Define dependency versions as constraints
implementation("org.apache.commons:commons-text:1.10.0")
}
} }
testing { testing {

View File

@ -2,8 +2,7 @@ package de.itkl.textprocessing
import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.Flow
class Histogram : Iterable<Pair<String, UInt>>{ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : Iterable<Pair<String, UInt>>{
private val histo: MutableMap<String,UInt> = mutableMapOf()
companion object { companion object {
suspend fun from(flow: Flow<String>): Histogram { suspend fun from(flow: Flow<String>): Histogram {
@ -11,6 +10,12 @@ class Histogram : Iterable<Pair<String, UInt>>{
flow.collect(this::add) flow.collect(this::add)
} }
} }
fun from(sequence: Sequence<Map<String, String>>): Histogram {
val histo = sequence.associate { map -> map["word"]!! to map["count"]!!.toUInt() }
.toMutableMap()
return Histogram(histo)
}
} }
fun add(word: String) { fun add(word: String) {
@ -20,7 +25,6 @@ class Histogram : Iterable<Pair<String, UInt>>{
} }
val size get() = histo.size val size get() = histo.size
override fun iterator(): Iterator<Pair<String, UInt>> { override fun iterator(): Iterator<Pair<String, UInt>> {
return iterator { return iterator {
histo.forEach { (t, u) -> yield(t to u) } histo.forEach { (t, u) -> yield(t to u) }

View File

@ -1,11 +1,11 @@
package de.itkl.textprocessing package de.itkl.textprocessing
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import java.io.File import java.io.File
import java.nio.file.Path import java.nio.file.Path
class HistogramCsvStorage { class HistogramCsvStorage {
suspend fun save(histogram: Histogram, file: File, progressOp: (Long) -> Unit = {}) { suspend fun save(histogram: Histogram, file: File, progressOp: (Long) -> Unit = {}) {
csvWriter {} csvWriter {}
.openAsync(file, append = false) { .openAsync(file, append = false) {
@ -16,7 +16,11 @@ class HistogramCsvStorage {
} }
} }
} }
fun read(path: Path): Histogram { suspend fun read(file: File): Histogram {
TODO() return csvReader { }
.openAsync(file) {
val sequence = readAllWithHeaderAsSequence()
Histogram.from(sequence)
}
} }
} }

View File

@ -3,6 +3,7 @@ plugins {
} }
dependencies { dependencies {
implementation(project(":libraries:textprocessing")) api(project(":libraries:textprocessing"))
implementation("com.github.ajalt.mordant:mordant:2.2.0") implementation("com.github.ajalt.mordant:mordant:2.2.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
} }

View File

@ -0,0 +1,31 @@
package de.itkl.tfidf
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.textprocessing.Histogram
import io.github.oshai.kotlinlogging.KotlinLogging
import java.io.File
import kotlin.math.max
// File-private logger obtained from the kotlin-logging KotlinLogging factory.
private val Log = KotlinLogging.logger { }
/**
 * Accumulates normalized term frequencies (TF) per word.
 *
 * A word's frequency is its count divided by the largest count found in the
 * histogram passed to [update], so the most frequent word gets the value 1.0.
 */
class Tf {
    private val data: MutableMap<String, Double> = mutableMapOf()

    /**
     * Computes the normalized frequency of every word in [histogram] and stores it,
     * overwriting any value previously stored for the same word.
     *
     * An empty histogram leaves the stored data untouched instead of failing with
     * [NoSuchElementException].
     *
     * @param histogram word counts to normalize
     * @return this instance, so calls can be chained
     */
    fun update(histogram: Histogram): Tf {
        // maxOfOrNull rather than maxOf: an empty histogram has no maximum to divide by.
        val maxCount = histogram.maxOfOrNull { (_, count) -> count }?.toDouble() ?: return this
        histogram.forEach { (word, count) ->
            data[word] = count.toDouble() / maxCount
        }
        return this
    }

    /**
     * Writes the stored frequencies to [file] as CSV: first a header row with the
     * columns "term" and "frequency", then one row per stored word.
     *
     * The writer is opened with `append = false`.
     *
     * @param file destination CSV file
     */
    suspend fun saveToCsv(file: File) {
        csvWriter {}
            .openAsync(file, append = false) {
                writeRow("term", "frequency")
                data.forEach { (term, frequency) ->
                    writeRow(term, frequency)
                }
            }
    }
}

View File

@ -1,27 +1,30 @@
package de.itkl.tfidf package de.itkl.tfidf
import com.github.ajalt.mordant.animation.progressAnimation
import com.github.ajalt.mordant.terminal.Terminal import com.github.ajalt.mordant.terminal.Terminal
import de.itkl.textprocessing.Histogram import de.itkl.textprocessing.Histogram
import de.itkl.textprocessing.HistogramCsvStorage import de.itkl.textprocessing.HistogramCsvStorage
import de.itkl.textprocessing.TextFile import de.itkl.textprocessing.TextFile
import io.github.oshai.kotlinlogging.KotlinLogging import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.flow.map import kotlinx.coroutines.flow.map
import kotlinx.coroutines.flow.take
import kotlinx.coroutines.withTimeoutOrNull
import org.tartarus.snowball.SnowballStemmer import org.tartarus.snowball.SnowballStemmer
import org.tartarus.snowball.ext.GermanStemmer import org.tartarus.snowball.ext.GermanStemmer
import java.awt.SystemColor.text
import java.io.File import java.io.File
import kotlin.io.path.exists
private val Log = KotlinLogging.logger { } private val Log = KotlinLogging.logger { }
class TfIdf { class TfIdf {
suspend fun buildTfIdfDict( suspend fun computeTf(
corpus: File, corpus: File,
language: Language language: Language
) { ): Histogram {
Log.info { "Processing $corpus" } Log.info { "Processing $corpus" }
val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.csv")
if(destination.exists()) {
return HistogramCsvStorage().read(destination.toFile())
}
val filesize = corpus.length() val filesize = corpus.length()
val t = Terminal() val t = Terminal()
@ -31,12 +34,18 @@ class TfIdf {
Histogram.from(words) Histogram.from(words)
} }
val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.cv")
t.progressBar("Saving ${histogram.size} entries", histogram.size.toLong()) { t.progressBar("Saving ${histogram.size} entries", histogram.size.toLong()) {
HistogramCsvStorage() HistogramCsvStorage()
.save(histogram,destination.toFile()) { entriesWritten -> update(entriesWritten)} .save(histogram,destination.toFile()) { entriesWritten -> update(entriesWritten)}
} }
return histogram
}
suspend fun normalizeTf(histogram: Histogram, destination: File) {
Log.info { "Write tf to $destination" }
Tf()
.update(histogram)
.saveToCsv(destination)
} }
private fun stemmer(language: Language): SnowballStemmer { private fun stemmer(language: Language): SnowballStemmer {