Add text processing and tfidf libraries

This commit introduces two new libraries: textprocessing and tfidf. The textprocessing library provides classes to read words from a text file, generate a histogram from the words, and store the histogram in a CSV file. The tfidf library adds support for term frequency–inverse document frequency (tf-idf) computation using the functionality provided by the textprocessing library.
2023-12-15 17:17:27 +01:00 · 2023-12-15 17:17:27 +01:00 · 67d65cee93
parent 1259dc8764
commit 67d65cee93
8 changed files with 174 additions and 0 deletions
--- a/libraries/textprocessing/build.gradle.kts
+++ b/libraries/textprocessing/build.gradle.kts
@ -0,0 +1,9 @@
 // Build script for the textprocessing library.
 plugins {
    // Project-specific convention plugin; what it configures is defined outside this script.
    id("docthor.kotlin-library-conventions")
 }
 dependencies {
    // Lucene text-analysis dependency; TextFile tokenizes with Lucene's StandardTokenizer.
    // Declared as `api`, so it is also exposed on the compile classpath of modules
    // that depend on this library.
    api("org.apache.lucene:lucene-analysis-common:9.9.0")
    // CSV library used by HistogramCsvStorage; `implementation` keeps it internal to this module.
    implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
 }
--- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt
+++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt
@ -0,0 +1,27 @@
 package de.itkl.textprocessing
 import kotlinx.coroutines.flow.Flow
 /**
 * Counts how often each word has been added.
 *
 * Iterating yields one `(word, count)` pair per distinct word. The iterator is
 * lazy: it starts walking the underlying map only when the first element is requested.
 */
 class Histogram : Iterable<Pair<String, UInt>> {
    // Number of occurrences recorded so far, keyed by word.
    private val counts: MutableMap<String, UInt> = mutableMapOf()

    /**
     * Records one more occurrence of [word].
     */
    fun add(word: String) {
        val previous = counts[word] ?: 0u
        counts[word] = previous + 1u
    }

    /**
     * Returns an iterator over all `(word, count)` pairs currently stored.
     */
    override fun iterator(): Iterator<Pair<String, UInt>> {
        return iterator {
            for ((word, count) in counts) {
                yield(word to count)
            }
        }
    }

    companion object {
        /**
         * Collects [flow] to completion and returns a histogram of every emitted word.
         */
        suspend fun from(flow: Flow<String>): Histogram {
            val histogram = Histogram()
            flow.collect { word -> histogram.add(word) }
            return histogram
        }
    }
 }
--- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt
+++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt
@ -0,0 +1,16 @@
 package de.itkl.textprocessing
 import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
 import java.io.File
 import java.nio.file.Path
 /**
 * Stores [Histogram]s as CSV files.
 */
 class HistogramCsvStorage {
    /**
     * Writes [histogram] to [file], replacing any previous content.
     *
     * The file starts with a `word,count` header row, followed by one row per
     * distinct word with its occurrence count in decimal notation.
     *
     * @param histogram The histogram to persist.
     * @param file The CSV file to (over)write.
     */
    suspend fun save(histogram: Histogram, file: File) {
        csvWriter().openAsync(file, append = false) {
            writeRow(listOf("word", "count"))
            for ((word, count) in histogram) {
                // Write the count as text so the cell holds the plain unsigned number.
                writeRow(listOf(word, count.toString()))
            }
        }
    }

    /**
     * Reads a histogram back from [path].
     *
     * Not implemented yet: always throws [NotImplementedError].
     */
    fun read(path: Path): Histogram {
        TODO("Reading a histogram from CSV is not implemented yet")
    }
 }
--- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/ProgressInputStream.kt
+++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/ProgressInputStream.kt
@ -0,0 +1,39 @@
 package de.itkl.textprocessing
 import java.io.InputStream
 /**
 * Represents an input stream that tracks the progress of reading from an underlying input stream.
 *
 * Closing this stream closes the underlying stream as well.
 *
 * Note: [bytesRead] is volatile, but its updates are read-modify-write operations and
 * therefore not atomic; reading from several threads at once may lose updates.
 *
 * @property inputStream The underlying input stream to read from.
 * @property updateOp The operation to be executed when the number of bytes read changes.
 * It receives the new total and runs on the reading thread.
 * @property bytesRead The number of bytes read from the input stream.
 */
 class ProgressInputStream(
    private val inputStream: InputStream,
    private val updateOp: (Long) -> Unit) : InputStream() {
    @Volatile
    var bytesRead: Long = 0
        private set(value) {
            field = value
            updateOp(value)
        }

    /**
     * Reads a single byte and counts it unless the end of the stream was reached.
     */
    override fun read(): Int {
        val byte = inputStream.read()
        if (byte != -1) {
            bytesRead++
        }
        return byte
    }

    /**
     * Reads up to [len] bytes into [b] starting at [off] and adds the number of
     * bytes actually read to [bytesRead].
     *
     * @return The number of bytes read, or -1 at the end of the stream.
     */
    override fun read(b: ByteArray, off: Int, len: Int): Int {
        val count = inputStream.read(b, off, len)
        if (count != -1) {
            bytesRead += count
        }
        return count
    }

    /**
     * Reads up to `b.size` bytes into [b]; equivalent to `read(b, 0, b.size)`.
     */
    override fun read(b: ByteArray): Int {
        return this.read(b, 0, b.size)
    }

    /**
     * Closes the underlying stream. Without this override the inherited no-op
     * `close()` would leave the wrapped stream (e.g. a file handle) open.
     */
    override fun close() {
        inputStream.close()
    }
 }
--- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt
+++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt
@ -0,0 +1,30 @@
 package de.itkl.textprocessing
 import kotlinx.coroutines.flow.Flow
 import kotlinx.coroutines.flow.flow
 import kotlinx.coroutines.flow.onCompletion
 import org.apache.lucene.analysis.standard.StandardTokenizer
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
 import org.apache.lucene.util.AttributeFactory
 import java.io.File
 import java.io.FileReader
 import java.io.InputStreamReader
 /**
 * A text file whose content can be streamed word by word.
 *
 * @property file The file to read.
 */
 class TextFile(val file: File) {
    /**
     * Returns a cold flow of the tokens that Lucene's [StandardTokenizer] finds in [file].
     *
     * The file is opened when the flow is collected, not when this function is called,
     * so every collection reads the file from the start and an uncollected flow holds no
     * file handle. The file and the tokenizer are closed when collection ends, whether it
     * completes normally, fails, or is cancelled (for example by a downstream `take`).
     *
     * The file content is decoded as UTF-8.
     *
     * @param progressOp Called with the total number of bytes read so far whenever
     * more bytes have been read from the file.
     * @return A flow emitting one string per token.
     */
    fun words(progressOp: (read: Long) -> Unit = {}): Flow<String> = flow {
        // `use` closes the file itself; this does not rely on the wrapper streams
        // forwarding close() to it.
        file.inputStream().use { rawStream ->
            val tokenizer = StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY)
            val term = tokenizer.addAttribute(CharTermAttribute::class.java)
            try {
                val progressStream = ProgressInputStream(rawStream, progressOp)
                tokenizer.setReader(InputStreamReader(progressStream, Charsets.UTF_8))
                tokenizer.reset()
                while (tokenizer.incrementToken()) {
                    emit(term.toString())
                }
                // Part of the TokenStream workflow: signal end-of-stream before closing.
                tokenizer.end()
            } finally {
                tokenizer.close()
            }
        }
    }
 }
--- a/libraries/tfidf/build.gradle.kts
+++ b/libraries/tfidf/build.gradle.kts
@ -1,3 +1,8 @@
 // Build script for the tfidf library.
 plugins {
    // Project-specific convention plugin; what it configures is defined outside this script.
    id("docthor.kotlin-library-conventions")
 }
 dependencies {
    // Provides Histogram and TextFile, which TfIdf uses.
    // NOTE(review): TfIdf also imports org.tartarus.snowball and kotlin-logging, neither of
    // which is declared here — presumably they arrive transitively (Lucene is an `api`
    // dependency of textprocessing) or via the convention plugin; confirm.
    implementation(project(":libraries:textprocessing"))
    // NOTE(review): not referenced by the tfidf sources visible in this change — confirm intended use.
    implementation("com.github.ajalt.mordant:mordant:2.2.0")
 }
--- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Language.kt
+++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Language.kt
@ -0,0 +1,5 @@
 package de.itkl.tfidf
 /**
 * Languages that can be selected for tf-idf processing.
 */
 enum class Language {
    /** German; [TfIdf] stems words of this language with Snowball's `GermanStemmer`. */
    DE,
 }
--- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt
+++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt
@ -0,0 +1,43 @@
 package de.itkl.tfidf
 import de.itkl.textprocessing.Histogram
 import de.itkl.textprocessing.TextFile
 import io.github.oshai.kotlinlogging.KotlinLogging
 import kotlinx.coroutines.flow.map
 import kotlinx.coroutines.flow.take
 import kotlinx.coroutines.withTimeoutOrNull
 import org.tartarus.snowball.SnowballStemmer
 import org.tartarus.snowball.ext.GermanStemmer
 import java.io.File
 private val Log = KotlinLogging.logger { }

 /**
 * Builds word statistics for a text corpus as a first step towards tf-idf computation.
 */
 class TfIdf {
    /**
     * Reads words from [corpus], stems them for [language], counts the stems in a
     * [Histogram] and prints every `stem<TAB>count` pair to standard output.
     *
     * @param corpus The text file to process.
     * @param language The language whose stemmer is applied to each word.
     * @param maxWords Maximum number of words taken from the corpus. Defaults to 100,
     * the limit that was previously hard-coded.
     * @throws IllegalArgumentException if [maxWords] is not positive.
     */
    suspend fun buildTfIdfDict(
        corpus: File,
        language: Language,
        maxWords: Int = 100,
    ) {
        require(maxWords > 0) { "maxWords must be positive, but was $maxWords" }
        Log.info { "Processing $corpus" }
        val stemmer = stemmer(language)
        val words = TextFile(corpus).words()
            .take(maxWords)
            .map { stemmer.stem(it) }
        val histogram = Histogram.from(words)
        histogram.forEach { (word, count) ->
            println("$word\t$count")
        }
    }

    /**
     * Creates a new Snowball stemmer for [language].
     */
    private fun stemmer(language: Language): SnowballStemmer {
        // No `else` branch: the compiler reports any language added without a stemmer.
        return when (language) {
            Language.DE -> GermanStemmer()
        }
    }

    /**
     * Stems [word] with this stemmer and returns the result.
     *
     * The stemmer keeps the word in its mutable `current` state, so one stemmer
     * instance must not be used from several threads at once.
     */
    private fun SnowballStemmer.stem(word: String): String {
        current = word
        stem()
        return current
    }
 }