From 67d65cee93c5c0b4b76889f469a43b19c2e739fd Mon Sep 17 00:00:00 2001
From: Timo Bryant
Date: Fri, 15 Dec 2023 17:17:27 +0100
Subject: [PATCH] Add text processing and tfidf libraries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit introduces two new libraries: textprocessing and tfidf.
The textprocessing library provides classes to read words from a text
file, generate a histogram from the words, and store the histogram in a
CSV file. The tfidf library adds support for term frequency–inverse
document frequency (tf-idf) computation using the functionality provided
by the textprocessing library.
---
 libraries/textprocessing/build.gradle.kts     |  9 ++++
 .../de/itkl/textprocessing/Histogram.kt       | 27 ++++++++++++
 .../textprocessing/HistogramCsvStorage.kt     | 16 +++++++
 .../textprocessing/ProgressInputStream.kt     | 39 +++++++++++++++++
 .../kotlin/de/itkl/textprocessing/TextFile.kt | 30 +++++++++++++
 libraries/tfidf/build.gradle.kts              |  5 +++
 .../src/main/kotlin/de/itkl/tfidf/Language.kt |  5 +++
 .../src/main/kotlin/de/itkl/tfidf/TfIdf.kt    | 43 +++++++++++++++++++
 8 files changed, 174 insertions(+)
 create mode 100644 libraries/textprocessing/build.gradle.kts
 create mode 100644 libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt
 create mode 100644 libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt
 create mode 100644 libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/ProgressInputStream.kt
 create mode 100644 libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt
 create mode 100644 libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Language.kt
 create mode 100644 libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt

diff --git a/libraries/textprocessing/build.gradle.kts b/libraries/textprocessing/build.gradle.kts
new file mode 100644
index 0000000..3983b7d
--- /dev/null
+++ b/libraries/textprocessing/build.gradle.kts
@@ -0,0 +1,9 @@
+plugins {
+    id("docthor.kotlin-library-conventions")
+}
+
+dependencies {
+    api("org.apache.lucene:lucene-analysis-common:9.9.0")
+    implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
+}
+
diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt
new file mode 100644
index 0000000..861927c
--- /dev/null
+++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt
@@ -0,0 +1,30 @@
+package de.itkl.textprocessing
+
+import kotlinx.coroutines.flow.Flow
+
+/**
+ * Counts how often each word has been added.
+ *
+ * Iterating yields one `word to count` pair per distinct word.
+ */
+class Histogram : Iterable<Pair<String, UInt>> {
+    private val histo: MutableMap<String, UInt> = mutableMapOf()
+
+    /** Increments the count of [word], starting at 1 for a word not seen before. */
+    fun add(word: String) {
+        histo[word] = (histo[word] ?: 0u) + 1u
+    }
+
+    /** Iterates lazily over the current `word to count` entries. */
+    override fun iterator(): Iterator<Pair<String, UInt>> =
+        histo.asSequence().map { (word, count) -> word to count }.iterator()
+
+    companion object {
+        /** Collects [flow] to completion and returns a histogram of the emitted words. */
+        suspend fun from(flow: Flow<String>): Histogram {
+            val histogram = Histogram()
+            flow.collect { histogram.add(it) }
+            return histogram
+        }
+    }
+}
\ No newline at end of file
diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt
new file mode 100644
index 0000000..1e47430
--- /dev/null
+++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/HistogramCsvStorage.kt
@@ -0,0 +1,33 @@
+package de.itkl.textprocessing
+
+import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
+import java.io.File
+import java.nio.file.Path
+
+/** Persists a [Histogram] as CSV with one `word,count` row per distinct word. */
+class HistogramCsvStorage {
+
+    /**
+     * Writes every entry of [histogram] to [file], replacing any previous content.
+     *
+     * @param histogram the word counts to store.
+     * @param file the CSV file to (over)write.
+     */
+    suspend fun save(histogram: Histogram, file: File) {
+        csvWriter().openAsync(file, append = false) {
+            histogram.forEach { (word, count) ->
+                // Store the count as text so the unsigned value is written verbatim.
+                writeRow(listOf(word, count.toString()))
+            }
+        }
+    }
+
+    /**
+     * Reads a histogram previously written by [save].
+     *
+     * Not implemented yet; calling it throws [NotImplementedError].
+     */
+    fun read(path: Path): Histogram {
+        TODO("Reading a histogram from CSV is not implemented yet")
+    }
+}
\ No newline at end of file
diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/ProgressInputStream.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/ProgressInputStream.kt
new file mode 100644
index 0000000..36ce538
--- /dev/null
+++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/ProgressInputStream.kt
@@ -0,0 +1,48 @@
+package de.itkl.textprocessing
+
+import java.io.InputStream
+
+/**
+ * Represents an input stream that tracks the progress of reading from an underlying input stream.
+ *
+ * Closing this stream closes the underlying stream as well.
+ *
+ * @property inputStream The underlying input stream to read from.
+ * @property updateOp The operation to be executed when the number of bytes read changes.
+ * @property bytesRead The number of bytes read from the input stream.
+ */
+class ProgressInputStream(
+    private val inputStream: InputStream,
+    private val updateOp: (Long) -> Unit
+) : InputStream() {
+    @Volatile
+    var bytesRead: Long = 0
+        private set(value) {
+            field = value
+            updateOp(value)
+        }
+
+    override fun read(): Int {
+        val byte = inputStream.read()
+        if (byte != -1) {
+            bytesRead++
+        }
+        return byte
+    }
+
+    override fun read(b: ByteArray, off: Int, len: Int): Int {
+        val count = inputStream.read(b, off, len)
+        // -1 signals end of stream and 0 means nothing was read; neither is progress worth reporting.
+        if (count > 0) {
+            bytesRead += count
+        }
+        return count
+    }
+
+    override fun read(b: ByteArray): Int = read(b, 0, b.size)
+
+    /** Releases the wrapped stream; the inherited [InputStream.close] would leave it open. */
+    override fun close() {
+        inputStream.close()
+    }
+}
\ No newline at end of file
diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt
new file mode 100644
index 0000000..d6cb02c
--- /dev/null
+++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt
@@ -0,0 +1,40 @@
+package de.itkl.textprocessing
+
+import kotlinx.coroutines.flow.Flow
+import kotlinx.coroutines.flow.flow
+import kotlinx.coroutines.flow.onCompletion
+import org.apache.lucene.analysis.standard.StandardTokenizer
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
+import org.apache.lucene.util.AttributeFactory
+import java.io.File
+import java.io.FileReader
+import java.io.InputStreamReader
+
+
+/** A text file whose content can be streamed as individual word tokens. */
+class TextFile(val file: File) {
+    /**
+     * Returns a cold flow of the tokens that Lucene's [StandardTokenizer] finds in [file].
+     *
+     * The file is opened when collection starts and closed when it ends, whether the
+     * flow completes, fails or is cancelled, so the flow can be collected repeatedly.
+     * The content is decoded as UTF-8.
+     *
+     * @param progressOp called with the total number of bytes read so far as reading advances.
+     */
+    fun words(progressOp: (read: Long) -> Unit = {}): Flow<String> = flow {
+        val input = ProgressInputStream(file.inputStream(), progressOp)
+        InputStreamReader(input, Charsets.UTF_8).use { reader ->
+            StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY).use { tokenizer ->
+                val term = tokenizer.addAttribute(CharTermAttribute::class.java)
+                tokenizer.setReader(reader)
+                tokenizer.reset()
+                while (tokenizer.incrementToken()) {
+                    emit(term.toString())
+                }
+                // Lucene's TokenStream workflow expects end() after the last token and before close().
+                tokenizer.end()
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/libraries/tfidf/build.gradle.kts b/libraries/tfidf/build.gradle.kts
index 1d52d1f..438770d 100644
--- a/libraries/tfidf/build.gradle.kts
+++ b/libraries/tfidf/build.gradle.kts
@@ -1,3 +1,8 @@
 plugins {
     id("docthor.kotlin-library-conventions")
 }
+
+dependencies {
+    implementation(project(":libraries:textprocessing"))
+    implementation("com.github.ajalt.mordant:mordant:2.2.0")
+}
diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Language.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Language.kt
new file mode 100644
index 0000000..a5fa7c9
--- /dev/null
+++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Language.kt
@@ -0,0 +1,7 @@
+package de.itkl.tfidf
+
+/** Languages for which a stemmer is available. */
+enum class Language {
+    /** German. */
+    DE
+}
\ No newline at end of file
diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt
new file mode 100644
index 0000000..a6075e9
--- /dev/null
+++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt
@@ -0,0 +1,56 @@
+package de.itkl.tfidf
+
+import de.itkl.textprocessing.Histogram
+import de.itkl.textprocessing.TextFile
+import io.github.oshai.kotlinlogging.KotlinLogging
+import kotlinx.coroutines.flow.map
+import kotlinx.coroutines.flow.take
+import kotlinx.coroutines.withTimeoutOrNull
+import org.tartarus.snowball.SnowballStemmer
+import org.tartarus.snowball.ext.GermanStemmer
+import java.io.File
+
+
+private val logger = KotlinLogging.logger { }
+
+/** Number of words read from a corpus when the caller does not specify a limit. */
+private const val DEFAULT_WORD_LIMIT = 100
+
+/** Builds word statistics for a text corpus; currently a histogram of stemmed words. */
+class TfIdf {
+    /**
+     * Stems the leading words of [corpus] and prints each distinct stem with its count,
+     * tab-separated, to standard output.
+     *
+     * @param corpus the text file to read.
+     * @param language selects the stemmer applied to every word.
+     * @param wordLimit maximum number of words taken from the corpus; must be positive.
+     */
+    suspend fun buildTfIdfDict(
+        corpus: File,
+        language: Language,
+        wordLimit: Int = DEFAULT_WORD_LIMIT
+    ) {
+        logger.info { "Processing $corpus" }
+        val stemmer = stemmer(language)
+        val stems = TextFile(corpus).words()
+            .take(wordLimit)
+            .map { stemmer.stem(it) }
+        Histogram.from(stems).forEach { (word, count) ->
+            println("$word\t$count")
+        }
+    }
+
+    /** Creates the Snowball stemmer for [language]. */
+    private fun stemmer(language: Language): SnowballStemmer =
+        when (language) {
+            Language.DE -> GermanStemmer()
+        }
+
+    /** Runs this stemmer on [word] and returns the stemmed form. */
+    private fun SnowballStemmer.stem(word: String): String {
+        current = word
+        stem()
+        return current
+    }
+}
\ No newline at end of file