Add text processing and tfidf libraries

This commit introduces two new libraries: textprocessing and tfidf. The textprocessing library provides classes to read words from a text file, generate a histogram from the words, and store the histogram in a CSV file. The tfidf library adds support for term frequency–inverse document frequency (tf-idf) computation using the functionality provided by the textprocessing library.
develop
Timo Bryant 2023-12-15 17:17:27 +01:00
parent 1259dc8764
commit 67d65cee93
8 changed files with 174 additions and 0 deletions

View File

@ -0,0 +1,9 @@
plugins {
// Shared Kotlin library conventions for this project.
id("docthor.kotlin-library-conventions")
}
dependencies {
// Lucene analyzers: StandardTokenizer is used by TextFile for word extraction.
// Declared as `api` so the tokenizer types are visible to consumers of this library.
api("org.apache.lucene:lucene-analysis-common:9.9.0")
// CSV reading/writing used by HistogramCsvStorage; an internal detail, so `implementation`.
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
}

View File

@ -0,0 +1,27 @@
package de.itkl.textprocessing
import kotlinx.coroutines.flow.Flow
/**
 * Mutable word-frequency counter.
 *
 * Words are added one at a time via [add]; iterating the histogram yields
 * each distinct word paired with the number of times it was added.
 */
class Histogram : Iterable<Pair<String, UInt>>{
    private val counts: MutableMap<String, UInt> = mutableMapOf()

    /** Increments the count for [word]; an unseen word starts at 1. */
    fun add(word: String) {
        counts[word] = (counts[word] ?: 0u) + 1u
    }

    /** Yields (word, count) pairs in the underlying map's order. */
    override fun iterator(): Iterator<Pair<String, UInt>> =
        counts.entries.map { it.key to it.value }.iterator()

    companion object {
        /** Builds a histogram by counting every word emitted by [flow]. */
        suspend fun from(flow: Flow<String>): Histogram {
            val histogram = Histogram()
            flow.collect { word -> histogram.add(word) }
            return histogram
        }
    }
}

View File

@ -0,0 +1,16 @@
package de.itkl.textprocessing
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import java.io.File
import java.nio.file.Path
/**
 * Persists a [Histogram] as a two-column CSV file: one row per word, with
 * the word in the first column and its count in the second.
 */
class HistogramCsvStorage {
    /**
     * Writes every (word, count) pair of [histogram] to [file] as a CSV row,
     * overwriting any existing content.
     *
     * Bug fix: the writer was previously opened and immediately closed without
     * emitting any rows, so the saved file was always empty.
     */
    suspend fun save(histogram: Histogram, file: File) {
        csvWriter().openAsync(file, append = false) {
            histogram.forEach { (word, count) ->
                writeRow(listOf(word, count.toString()))
            }
        }
    }

    /**
     * Reads a histogram previously written by [save].
     *
     * TODO(review): not yet implemented — throws [NotImplementedError].
     */
    fun read(path: Path): Histogram {
        TODO()
    }
}

View File

@ -0,0 +1,39 @@
package de.itkl.textprocessing
import java.io.InputStream
/**
* Represents an input stream that tracks the progress of reading from an underlying input stream.
*
* @property inputStream The underlying input stream to read from.
* @property updateOp The operation to be executed when the number of bytes read changes.
* @property bytesRead The number of bytes read from the input stream.
*/
/**
 * An [InputStream] decorator that tracks how many bytes have been read from
 * the wrapped stream and reports each new total through a callback.
 *
 * @property inputStream The underlying input stream to read from.
 * @property updateOp Invoked with the new total whenever [bytesRead] changes.
 * @property bytesRead Total number of bytes read from the stream so far.
 */
class ProgressInputStream(
    private val inputStream: InputStream,
    private val updateOp: (Long) -> Unit) : InputStream() {

    @Volatile
    var bytesRead: Long = 0
        private set(value) {
            field = value
            // Notify the observer on every change of the running total.
            updateOp(value)
        }

    override fun read(): Int {
        val byte = inputStream.read()
        // -1 signals end of stream; only actual bytes count toward progress.
        if (byte != -1) {
            bytesRead++
        }
        return byte
    }

    override fun read(b: ByteArray, off: Int, len: Int): Int {
        val count = inputStream.read(b, off, len)
        if (count != -1) {
            bytesRead += count
        }
        return count
    }

    override fun read(b: ByteArray): Int = read(b, 0, b.size)

    override fun available(): Int = inputStream.available()

    /**
     * Bug fix: [InputStream.close] is a no-op by default, so the wrapped
     * stream was never released when this wrapper was closed. Delegate to
     * close the underlying resource.
     */
    override fun close() {
        inputStream.close()
    }
}

View File

@ -0,0 +1,30 @@
package de.itkl.textprocessing
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.onCompletion
import org.apache.lucene.analysis.standard.StandardTokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.AttributeFactory
import java.io.File
import java.io.FileReader
import java.io.InputStreamReader
/**
 * A plain-text file whose content can be streamed as individual words.
 */
class TextFile(val file: File) {
    /**
     * Returns a cold [Flow] of word tokens produced by Lucene's
     * [StandardTokenizer].
     *
     * @param progressOp Called with the running count of bytes read as the
     *   file is consumed; defaults to a no-op.
     */
    fun words(progressOp: (read: Long) -> Unit = {}): Flow<String> {
        return flow {
            // Fix: acquire the tokenizer and file stream lazily inside the
            // flow builder. Previously they were opened eagerly in words(),
            // leaking the open file whenever the returned flow was never
            // collected.
            val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
            val tokenizer = StandardTokenizer(factory)
            val reader = ProgressInputStream(file.inputStream(), progressOp)
            tokenizer.setReader(InputStreamReader(reader))
            tokenizer.reset()
            val attr = tokenizer.addAttribute(CharTermAttribute::class.java)
            try {
                while (tokenizer.incrementToken()) {
                    emit(attr.toString())
                }
                // Lucene TokenStream contract: end() must be called after the
                // last incrementToken() returns false.
                tokenizer.end()
            } finally {
                // close() releases the tokenizer and its underlying reader,
                // also on cancellation or downstream failure.
                tokenizer.close()
            }
        }
    }
}

View File

@ -1,3 +1,8 @@
plugins {
// Shared Kotlin library conventions for this project.
id("docthor.kotlin-library-conventions")
}
dependencies {
// Word extraction (TextFile) and counting (Histogram) come from here.
implementation(project(":libraries:textprocessing"))
// NOTE(review): mordant (terminal output/styling) is declared but not
// referenced in the visible tfidf sources — confirm it is actually needed.
implementation("com.github.ajalt.mordant:mordant:2.2.0")
}

View File

@ -0,0 +1,5 @@
package de.itkl.tfidf
/**
 * Languages for which a Snowball stemmer is available in the tf-idf pipeline.
 */
enum class Language {
    /** German. */
    DE
}

View File

@ -0,0 +1,43 @@
package de.itkl.tfidf
import de.itkl.textprocessing.Histogram
import de.itkl.textprocessing.TextFile
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.flow.map
import kotlinx.coroutines.flow.take
import kotlinx.coroutines.withTimeoutOrNull
import org.tartarus.snowball.SnowballStemmer
import org.tartarus.snowball.ext.GermanStemmer
import java.io.File
private val Log = KotlinLogging.logger { }
class TfIdf {
    /**
     * Reads [corpus], stems each word for [language], and prints the resulting
     * word histogram (word and count, tab-separated) to stdout.
     *
     * @param corpus the text file to process.
     * @param language the language whose Snowball stemmer is applied.
     * @param wordLimit maximum number of words taken from the corpus. Defaults
     *   to 100 to preserve the previous hard-coded behavior.
     *   NOTE(review): the 100-word cap looks like a debug leftover — confirm,
     *   and pass a larger limit for real corpora.
     */
    suspend fun buildTfIdfDict(
        corpus: File,
        language: Language,
        wordLimit: Int = 100
    ) {
        Log.info { "Processing $corpus" }
        val stemmer = stemmer(language)
        val words = TextFile(corpus).words()
            .take(wordLimit)
            .map { stemmer.stem(it) }
        val histogram = Histogram.from(words)
        histogram.forEach { (word, count) ->
            println("$word\t$count")
        }
    }

    /** Returns the Snowball stemmer for [language]; exhaustive over [Language]. */
    private fun stemmer(language: Language): SnowballStemmer {
        return when (language) {
            Language.DE -> GermanStemmer()
        }
    }

    /** Stems [word] with this stemmer and returns the stemmed form. */
    private fun SnowballStemmer.stem(word: String): String {
        current = word
        stem()
        return current
    }
}