Add text processing and tfidf libraries

This commit introduces two new libraries: textprocessing and tfidf. The textprocessing library provides classes to read words from a text file, generate a histogram from the words, and store the histogram in a CSV file. The tfidf library adds support for term frequency–inverse document frequency (tf-idf) computation using the functionality provided by the textprocessing library.
develop
Timo Bryant 2023-12-15 17:17:27 +01:00
parent 1259dc8764
commit 67d65cee93
8 changed files with 174 additions and 0 deletions

View File

@ -0,0 +1,9 @@
plugins {
// Shared Kotlin library conventions for this project.
id("docthor.kotlin-library-conventions")
}
dependencies {
// Lucene analyzers: StandardTokenizer is used by TextFile for word extraction.
// Declared as `api` so the tokenizer types are visible to consumers of this library.
api("org.apache.lucene:lucene-analysis-common:9.9.0")
// CSV reading/writing used by HistogramCsvStorage; an internal detail, so `implementation`.
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
}

View File

@ -0,0 +1,27 @@
package de.itkl.textprocessing
import kotlinx.coroutines.flow.Flow
/**
 * Mutable word-frequency counter.
 *
 * Words are added one at a time via [add]; iterating the histogram yields
 * each distinct word paired with the number of times it was added.
 */
class Histogram : Iterable<Pair<String, UInt>>{
    private val counts: MutableMap<String, UInt> = mutableMapOf()

    /** Increments the count for [word]; an unseen word starts at 1. */
    fun add(word: String) {
        counts[word] = (counts[word] ?: 0u) + 1u
    }

    /** Yields (word, count) pairs in the underlying map's order. */
    override fun iterator(): Iterator<Pair<String, UInt>> =
        counts.entries.map { it.key to it.value }.iterator()

    companion object {
        /** Builds a histogram by counting every word emitted by [flow]. */
        suspend fun from(flow: Flow<String>): Histogram {
            val histogram = Histogram()
            flow.collect { word -> histogram.add(word) }
            return histogram
        }
    }
}

View File

@ -0,0 +1,16 @@
package de.itkl.textprocessing
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import java.io.File
import java.nio.file.Path
/**
 * Persists a [Histogram] as a two-column CSV file: one row per word, with
 * the word in the first column and its count in the second.
 */
class HistogramCsvStorage {
    /**
     * Writes every (word, count) pair of [histogram] to [file] as a CSV row,
     * overwriting any existing content.
     *
     * Bug fix: the writer was previously opened and immediately closed without
     * emitting any rows, so the saved file was always empty.
     */
    suspend fun save(histogram: Histogram, file: File) {
        csvWriter().openAsync(file, append = false) {
            histogram.forEach { (word, count) ->
                writeRow(listOf(word, count.toString()))
            }
        }
    }

    /**
     * Reads a histogram previously written by [save].
     *
     * TODO(review): not yet implemented — throws [NotImplementedError].
     */
    fun read(path: Path): Histogram {
        TODO()
    }
}

View File

@ -0,0 +1,39 @@
package de.itkl.textprocessing
import java.io.InputStream
/**
* Represents an input stream that tracks the progress of reading from an underlying input stream.
*
* @property inputStream The underlying input stream to read from.
* @property updateOp The operation to be executed when the number of bytes read changes.
* @property bytesRead The number of bytes read from the input stream.
*/
/**
 * An [InputStream] decorator that tracks how many bytes have been read from
 * the wrapped stream and reports each new total through a callback.
 *
 * @property inputStream The underlying input stream to read from.
 * @property updateOp Invoked with the new total whenever [bytesRead] changes.
 * @property bytesRead Total number of bytes read from the stream so far.
 */
class ProgressInputStream(
    private val inputStream: InputStream,
    private val updateOp: (Long) -> Unit) : InputStream() {

    @Volatile
    var bytesRead: Long = 0
        private set(value) {
            field = value
            // Notify the observer on every change of the running total.
            updateOp(value)
        }

    override fun read(): Int {
        val byte = inputStream.read()
        // -1 signals end of stream; only actual bytes count toward progress.
        if (byte != -1) {
            bytesRead++
        }
        return byte
    }

    override fun read(b: ByteArray, off: Int, len: Int): Int {
        val count = inputStream.read(b, off, len)
        if (count != -1) {
            bytesRead += count
        }
        return count
    }

    override fun read(b: ByteArray): Int = read(b, 0, b.size)

    override fun available(): Int = inputStream.available()

    /**
     * Bug fix: [InputStream.close] is a no-op by default, so the wrapped
     * stream was never released when this wrapper was closed. Delegate to
     * close the underlying resource.
     */
    override fun close() {
        inputStream.close()
    }
}

View File

@ -0,0 +1,30 @@
package de.itkl.textprocessing
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.onCompletion
import org.apache.lucene.analysis.standard.StandardTokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.AttributeFactory
import java.io.File
import java.io.FileReader
import java.io.InputStreamReader
/**
 * A plain-text file whose content can be streamed as individual words.
 */
class TextFile(val file: File) {
    /**
     * Returns a cold [Flow] of word tokens produced by Lucene's
     * [StandardTokenizer].
     *
     * @param progressOp Called with the running count of bytes read as the
     *   file is consumed; defaults to a no-op.
     */
    fun words(progressOp: (read: Long) -> Unit = {}): Flow<String> {
        return flow {
            // Fix: acquire the tokenizer and file stream lazily inside the
            // flow builder. Previously they were opened eagerly in words(),
            // leaking the open file whenever the returned flow was never
            // collected.
            val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
            val tokenizer = StandardTokenizer(factory)
            val reader = ProgressInputStream(file.inputStream(), progressOp)
            tokenizer.setReader(InputStreamReader(reader))
            tokenizer.reset()
            val attr = tokenizer.addAttribute(CharTermAttribute::class.java)
            try {
                while (tokenizer.incrementToken()) {
                    emit(attr.toString())
                }
                // Lucene TokenStream contract: end() must be called after the
                // last incrementToken() returns false.
                tokenizer.end()
            } finally {
                // close() releases the tokenizer and its underlying reader,
                // also on cancellation or downstream failure.
                tokenizer.close()
            }
        }
    }
}

View File

@ -1,3 +1,8 @@
plugins {
// Shared Kotlin library conventions for this project.
id("docthor.kotlin-library-conventions")
}
dependencies {
// Word extraction (TextFile) and counting (Histogram) come from here.
implementation(project(":libraries:textprocessing"))
// NOTE(review): mordant (terminal output/styling) is declared but not
// referenced in the visible tfidf sources — confirm it is actually needed.
implementation("com.github.ajalt.mordant:mordant:2.2.0")
}

View File

@ -0,0 +1,5 @@
package de.itkl.tfidf
/**
 * Languages for which a Snowball stemmer is available in the tf-idf pipeline.
 */
enum class Language {
    /** German. */
    DE
}

View File

@ -0,0 +1,43 @@
package de.itkl.tfidf
import de.itkl.textprocessing.Histogram
import de.itkl.textprocessing.TextFile
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.flow.map
import kotlinx.coroutines.flow.take
import kotlinx.coroutines.withTimeoutOrNull
import org.tartarus.snowball.SnowballStemmer
import org.tartarus.snowball.ext.GermanStemmer
import java.io.File
private val Log = KotlinLogging.logger { }
class TfIdf {
    /**
     * Reads [corpus], stems each word for [language], and prints the resulting
     * word histogram (word and count, tab-separated) to stdout.
     *
     * @param corpus the text file to process.
     * @param language the language whose Snowball stemmer is applied.
     * @param wordLimit maximum number of words taken from the corpus. Defaults
     *   to 100 to preserve the previous hard-coded behavior.
     *   NOTE(review): the 100-word cap looks like a debug leftover — confirm,
     *   and pass a larger limit for real corpora.
     */
    suspend fun buildTfIdfDict(
        corpus: File,
        language: Language,
        wordLimit: Int = 100
    ) {
        Log.info { "Processing $corpus" }
        val stemmer = stemmer(language)
        val words = TextFile(corpus).words()
            .take(wordLimit)
            .map { stemmer.stem(it) }
        val histogram = Histogram.from(words)
        histogram.forEach { (word, count) ->
            println("$word\t$count")
        }
    }

    /** Returns the Snowball stemmer for [language]; exhaustive over [Language]. */
    private fun stemmer(language: Language): SnowballStemmer {
        return when (language) {
            Language.DE -> GermanStemmer()
        }
    }

    /** Stems [word] with this stemmer and returns the stemmed form. */
    private fun SnowballStemmer.stem(word: String): String {
        current = word
        stem()
        return current
    }
}