Refactor code and add functionality for term frequency calculation
The major changes in this commit involve code refactoring and adding new functionality to calculate term frequency (TF). TF is now computed as a separate step from the TF-IDF calculation, which improves the modularity and maintainability of the code. Additionally, an unnecessary test file (MessageUtilsTest.kt) has been removed, and various dependencies have been updated or removed as needed. A few changes were also made to improve the readability and usability of the code.
develop
parent
351ab9b9e3
commit
ca51b50306
|
|
@ -1,7 +1,5 @@
|
|||
package docthor.app
|
||||
|
||||
import java.nio.file.Paths
|
||||
|
||||
|
||||
import com.github.ajalt.clikt.core.CliktCommand
|
||||
import com.github.ajalt.clikt.parameters.options.option
|
||||
|
|
@ -11,8 +9,9 @@ import com.github.ajalt.clikt.parameters.types.file
|
|||
import de.itkl.tfidf.Language
|
||||
import de.itkl.tfidf.TfIdf
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import java.io.File
|
||||
|
||||
class MainCommand : CliktCommand() {
|
||||
class ComputeTf : CliktCommand() {
|
||||
private val corpus by option(help = "corpus")
|
||||
.file()
|
||||
.required()
|
||||
|
|
@ -21,10 +20,14 @@ class MainCommand : CliktCommand() {
|
|||
.required()
|
||||
|
||||
override fun run() = runBlocking {
|
||||
TfIdf().buildTfIdfDict(
|
||||
corpus, language
|
||||
val tfIdf = TfIdf()
|
||||
val histogram = tfIdf.computeTf(
|
||||
corpus,
|
||||
language
|
||||
)
|
||||
tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile())
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
fun main(args: Array<String>) = MainCommand().main(args)
|
||||
/** Program entry point: passes the command-line arguments to the [ComputeTf] command. */
fun main(args: Array<String>) = ComputeTf().main(args)
|
||||
|
|
|
|||
|
|
@ -1,14 +0,0 @@
|
|||
/*
|
||||
* This Kotlin source file was generated by the Gradle "init" task.
|
||||
*/
|
||||
package docthor.app
|
||||
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
import org.junit.jupiter.api.Assertions.assertEquals
|
||||
|
||||
class MessageUtilsTest {
|
||||
@Test fun testGetMessage() {
|
||||
assertEquals("Hello World!", MessageUtils.getMessage())
|
||||
}
|
||||
}
|
||||
|
|
@ -5,16 +5,11 @@ plugins {
|
|||
}
|
||||
|
||||
repositories {
|
||||
// Use Maven Central for resolving dependencies.
|
||||
mavenCentral()
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
|
||||
constraints {
|
||||
// Define dependency versions as constraints
|
||||
implementation("org.apache.commons:commons-text:1.10.0")
|
||||
}
|
||||
}
|
||||
|
||||
testing {
|
||||
|
|
|
|||
|
|
@ -2,8 +2,7 @@ package de.itkl.textprocessing
|
|||
|
||||
import kotlinx.coroutines.flow.Flow
|
||||
|
||||
class Histogram : Iterable<Pair<String, UInt>>{
|
||||
private val histo: MutableMap<String,UInt> = mutableMapOf()
|
||||
class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : Iterable<Pair<String, UInt>>{
|
||||
|
||||
companion object {
|
||||
suspend fun from(flow: Flow<String>): Histogram {
|
||||
|
|
@ -11,6 +10,12 @@ class Histogram : Iterable<Pair<String, UInt>>{
|
|||
flow.collect(this::add)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds a [Histogram] from rows that each provide a "word" and a "count" entry.
 *
 * The "count" value is parsed as an unsigned integer. When the same word occurs
 * in several rows, the count of the last such row is kept.
 *
 * @param sequence rows keyed by column name.
 * @return a histogram backed by the collected word/count pairs.
 * @throws NullPointerException if a row lacks the "word" or "count" key.
 * @throws NumberFormatException if a "count" value is not a valid unsigned integer.
 */
fun from(sequence: Sequence<Map<String, String>>): Histogram {
    val counts = mutableMapOf<String, UInt>()
    for (row in sequence) {
        counts[row["word"]!!] = row["count"]!!.toUInt()
    }
    return Histogram(counts)
}
|
||||
}
|
||||
|
||||
fun add(word: String) {
|
||||
|
|
@ -20,7 +25,6 @@ class Histogram : Iterable<Pair<String, UInt>>{
|
|||
}
|
||||
|
||||
val size get() = histo.size
|
||||
|
||||
override fun iterator(): Iterator<Pair<String, UInt>> {
|
||||
return iterator {
|
||||
histo.forEach { (t, u) -> yield(t to u) }
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
package de.itkl.textprocessing
|
||||
|
||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
|
||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
||||
import java.io.File
|
||||
import java.nio.file.Path
|
||||
|
||||
class HistogramCsvStorage {
|
||||
|
||||
suspend fun save(histogram: Histogram, file: File, progressOp: (Long) -> Unit = {}) {
|
||||
csvWriter {}
|
||||
.openAsync(file, append = false) {
|
||||
|
|
@ -16,7 +16,11 @@ class HistogramCsvStorage {
|
|||
}
|
||||
}
|
||||
}
|
||||
fun read(path: Path): Histogram {
|
||||
TODO()
|
||||
/**
 * Loads a [Histogram] from the CSV [file].
 *
 * The file is read with its first row as header; the resulting rows are handed
 * to [Histogram.from], which looks up the "word" and "count" columns.
 *
 * @param file CSV file to read.
 * @return the histogram built from the file's rows.
 */
suspend fun read(file: File): Histogram {
    val reader = csvReader { }
    return reader.openAsync(file) {
        // The row sequence is consumed inside this block, while the file is still open.
        Histogram.from(readAllWithHeaderAsSequence())
    }
}
|
||||
}
|
||||
|
|
@ -3,6 +3,7 @@ plugins {
|
|||
}
|
||||
|
||||
dependencies {
|
||||
implementation(project(":libraries:textprocessing"))
|
||||
api(project(":libraries:textprocessing"))
|
||||
implementation("com.github.ajalt.mordant:mordant:2.2.0")
|
||||
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,31 @@
|
|||
package de.itkl.tfidf
|
||||
|
||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
||||
import de.itkl.textprocessing.Histogram
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
import java.io.File
|
||||
import kotlin.math.max
|
||||
|
||||
private val Log = KotlinLogging.logger { }
|
||||
/**
 * Collects normalized term frequencies and writes them to a CSV file.
 *
 * A term's frequency is its raw count divided by the largest count found in the
 * histogram given to [update], so the most frequent term is stored as 1.0.
 */
class Tf {
    private val data: MutableMap<String, Double> = mutableMapOf()

    /**
     * Normalizes every count in [histogram] by the histogram's maximum count and
     * stores the result per word, replacing any value stored earlier for that word.
     *
     * An empty histogram leaves the stored data unchanged.
     *
     * @param histogram word counts to normalize.
     * @return this instance, so calls can be chained.
     */
    fun update(histogram: Histogram): Tf {
        // maxOfOrNull avoids the NoSuchElementException that maxOf raises on an empty histogram.
        val max = histogram.maxOfOrNull { (_, count) -> count }?.toDouble() ?: return this
        histogram.forEach { (word, count) ->
            // If every count is zero, plain division would produce 0/0 = NaN.
            data[word] = if (max == 0.0) 0.0 else count.toDouble() / max
        }
        return this
    }

    /**
     * Writes the stored frequencies to [file] as CSV: a "term","frequency" header
     * row followed by one row per term. The writer is opened with `append = false`.
     *
     * @param file destination CSV file.
     */
    suspend fun saveToCsv(file: File) {
        csvWriter {}
            .openAsync(file, append = false) {
                writeRow("term", "frequency")
                data.forEach { (t, u) ->
                    writeRow(t, u)
                }
            }
    }
}
|
||||
|
|
@ -1,27 +1,30 @@
|
|||
package de.itkl.tfidf
|
||||
|
||||
import com.github.ajalt.mordant.animation.progressAnimation
|
||||
import com.github.ajalt.mordant.terminal.Terminal
|
||||
import de.itkl.textprocessing.Histogram
|
||||
import de.itkl.textprocessing.HistogramCsvStorage
|
||||
import de.itkl.textprocessing.TextFile
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
import kotlinx.coroutines.flow.map
|
||||
import kotlinx.coroutines.flow.take
|
||||
import kotlinx.coroutines.withTimeoutOrNull
|
||||
import org.tartarus.snowball.SnowballStemmer
|
||||
import org.tartarus.snowball.ext.GermanStemmer
|
||||
import java.awt.SystemColor.text
|
||||
import java.io.File
|
||||
import kotlin.io.path.exists
|
||||
|
||||
|
||||
private val Log = KotlinLogging.logger { }
|
||||
class TfIdf {
|
||||
suspend fun buildTfIdfDict(
|
||||
suspend fun computeTf(
|
||||
corpus: File,
|
||||
language: Language
|
||||
) {
|
||||
): Histogram {
|
||||
Log.info { "Processing $corpus" }
|
||||
val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.csv")
|
||||
|
||||
if(destination.exists()) {
|
||||
return HistogramCsvStorage().read(destination.toFile())
|
||||
}
|
||||
|
||||
val filesize = corpus.length()
|
||||
|
||||
val t = Terminal()
|
||||
|
|
@ -31,12 +34,18 @@ class TfIdf {
|
|||
Histogram.from(words)
|
||||
}
|
||||
|
||||
val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.cv")
|
||||
t.progressBar("Saving ${histogram.size} entries", histogram.size.toLong()) {
|
||||
HistogramCsvStorage()
|
||||
.save(histogram,destination.toFile()) { entriesWritten -> update(entriesWritten)}
|
||||
}
|
||||
return histogram
|
||||
}
|
||||
|
||||
/**
 * Computes term frequencies for [histogram] via [Tf] and saves them as CSV.
 *
 * @param histogram word counts to turn into term frequencies.
 * @param destination file the CSV output is written to.
 */
suspend fun normalizeTf(histogram: Histogram, destination: File) {
    Log.info { "Write tf to $destination" }
    val termFrequencies = Tf().update(histogram)
    termFrequencies.saveToCsv(destination)
}
|
||||
|
||||
private fun stemmer(language: Language): SnowballStemmer {
|
||||
|
|
|
|||
Loading…
Reference in New Issue