Add text processing and tfidf libraries
This commit introduces two new libraries: textprocessing and tfidf. The textprocessing library provides classes to read words from a text file, build a histogram of those words, and store the histogram in a CSV file. The tfidf library adds support for term frequency–inverse document frequency (tf-idf) computation on top of the functionality provided by the textprocessing library.
parent 1259dc8764
commit 67d65cee93
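A minimal usage sketch of how the new textprocessing pieces are meant to be combined: tokenize a file into words, count them in a Histogram, and hand the result to HistogramCsvStorage. The file paths are hypothetical, and HistogramCsvStorage.save is still an empty stub in this commit.

import de.itkl.textprocessing.Histogram
import de.itkl.textprocessing.HistogramCsvStorage
import de.itkl.textprocessing.TextFile
import kotlinx.coroutines.runBlocking
import java.io.File

fun main() = runBlocking {
    // Hypothetical input and output paths, for illustration only.
    val input = File("corpus.txt")
    val output = File("histogram.csv")

    // Tokenize the text file into a Flow of words and count them.
    val words = TextFile(input).words()
    val histogram = Histogram.from(words)

    // Persist the counts; save() does not write any rows yet in this commit.
    HistogramCsvStorage().save(histogram, output)
}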
@@ -0,0 +1,9 @@
plugins {
    id("docthor.kotlin-library-conventions")
}

dependencies {
    api("org.apache.lucene:lucene-analysis-common:9.9.0")
    implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
}

@@ -0,0 +1,27 @@
package de.itkl.textprocessing

import kotlinx.coroutines.flow.Flow

class Histogram : Iterable<Pair<String, UInt>> {
    private val histo: MutableMap<String, UInt> = mutableMapOf()

    companion object {
        suspend fun from(flow: Flow<String>): Histogram {
            return Histogram().apply {
                flow.collect(this::add)
            }
        }
    }

    fun add(word: String) {
        histo.compute(word) { _, count ->
            count?.let { it + 1u } ?: 1u
        }
    }

    override fun iterator(): Iterator<Pair<String, UInt>> {
        return iterator {
            histo.forEach { (t, u) -> yield(t to u) }
        }
    }
}

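The counting logic lives in add(): Map.compute starts an unseen word at 1u and increments it on every further occurrence. A small sketch of building a Histogram from a fixed Flow (entries come back in insertion order, since the backing map is a LinkedHashMap):

import de.itkl.textprocessing.Histogram
import kotlinx.coroutines.flow.flowOf
import kotlinx.coroutines.runBlocking

fun main() = runBlocking {
    // "der" occurs twice, "die" once.
    val histogram = Histogram.from(flowOf("der", "die", "der"))
    histogram.forEach { (word, count) -> println("$word -> $count") }
    // Expected output: der -> 2, then die -> 1
}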
@@ -0,0 +1,16 @@
package de.itkl.textprocessing

import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import java.io.File
import java.nio.file.Path

class HistogramCsvStorage {

    suspend fun save(histogram: Histogram, file: File) {
        csvWriter().openAsync(file, append = false) {
        }
    }
    fun read(path: Path): Histogram {
        TODO()
    }
}

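As committed, save() opens the target file but writes no rows, and read() is left as a TODO. A possible completion of save(), writing one word/count row per histogram entry, might look like the following. This is a sketch, not the committed implementation, and it assumes kotlin-csv's writeRow is available inside the openAsync block.

suspend fun save(histogram: Histogram, file: File) {
    csvWriter().openAsync(file, append = false) {
        // One row per distinct word: word, count.
        histogram.forEach { (word, count) ->
            writeRow(word, count.toString())
        }
    }
}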
@@ -0,0 +1,39 @@
package de.itkl.textprocessing

import java.io.InputStream

/**
 * Represents an input stream that tracks the progress of reading from an underlying input stream.
 *
 * @property inputStream The underlying input stream to read from.
 * @property updateOp The operation to be executed when the number of bytes read changes.
 * @property bytesRead The number of bytes read from the input stream.
 */
class ProgressInputStream(
    private val inputStream: InputStream,
    private val updateOp: (Long) -> Unit) : InputStream() {
    @Volatile
    var bytesRead: Long = 0
        private set(value) {
            field = value
            updateOp(value)
        }

    override fun read(): Int {
        val byte = inputStream.read()
        if (byte != -1) {
            bytesRead++
        }
        return byte
    }
    override fun read(b: ByteArray, off: Int, len: Int): Int {
        val bytesRead = inputStream.read(b, off, len)
        if (bytesRead != -1) {
            this.bytesRead += bytesRead
        }
        return bytesRead
    }
    override fun read(b: ByteArray): Int {
        return this.read(b, 0, b.size)
    }
}

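ProgressInputStream only counts bytes; interpreting the count is up to the caller. A short usage sketch that wraps a file stream and reports progress against the file size (the path is hypothetical):

import de.itkl.textprocessing.ProgressInputStream
import java.io.File

fun main() {
    val file = File("corpus.txt") // hypothetical path
    val total = file.length()
    // ProgressInputStream does not override close(), so close the file stream ourselves.
    file.inputStream().use { fis ->
        val progress = ProgressInputStream(fis) { read ->
            // Invoked every time bytesRead changes.
            println("read $read of $total bytes")
        }
        progress.readBytes() // drain the stream to drive the progress callback
    }
}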
@@ -0,0 +1,30 @@
package de.itkl.textprocessing

import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.onCompletion
import org.apache.lucene.analysis.standard.StandardTokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.AttributeFactory
import java.io.File
import java.io.FileReader
import java.io.InputStreamReader


class TextFile(val file: File) {
    fun words(progressOp: (read: Long) -> Unit = {}): Flow<String> {
        val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
        val tokenizer = StandardTokenizer(factory)
        val reader = ProgressInputStream(file.inputStream(), progressOp)
        tokenizer.setReader(InputStreamReader(reader))
        tokenizer.reset()
        val attr = tokenizer.addAttribute(CharTermAttribute::class.java)
        return flow {
            while (tokenizer.incrementToken()) {
                emit(attr.toString())
            }
        }.onCompletion {
            tokenizer.close()
        }
    }
}

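TextFile.words() delegates tokenization to Lucene's StandardTokenizer, which splits on Unicode word boundaries and drops punctuation but keeps the original case; lower-casing and stemming are left to callers such as the tfidf library. A rough sketch of what the word flow yields (the sample text and expected tokens are illustrative):

import de.itkl.textprocessing.TextFile
import kotlinx.coroutines.flow.toList
import kotlinx.coroutines.runBlocking
import java.io.File

fun main() = runBlocking {
    val file = File.createTempFile("sample", ".txt").apply {
        writeText("Die Katze schläft. Die Katze!")
        deleteOnExit()
    }
    val tokens = TextFile(file).words().toList()
    println(tokens) // expected roughly: [Die, Katze, schläft, Die, Katze]
}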
@@ -1,3 +1,8 @@
plugins {
    id("docthor.kotlin-library-conventions")
}

dependencies {
    implementation(project(":libraries:textprocessing"))
    implementation("com.github.ajalt.mordant:mordant:2.2.0")
}

@@ -0,0 +1,5 @@
package de.itkl.tfidf

enum class Language {
    DE
}

@@ -0,0 +1,43 @@
package de.itkl.tfidf

import de.itkl.textprocessing.Histogram
import de.itkl.textprocessing.TextFile
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.flow.map
import kotlinx.coroutines.flow.take
import kotlinx.coroutines.withTimeoutOrNull
import org.tartarus.snowball.SnowballStemmer
import org.tartarus.snowball.ext.GermanStemmer
import java.io.File


private val Log = KotlinLogging.logger { }
class TfIdf {
    suspend fun buildTfIdfDict(
        corpus: File,
        language: Language
    ) {
        Log.info { "Processing $corpus" }
        val stemmer = stemmer(language)
        val words = TextFile(corpus).words()
            .take(100)
            .map { stemmer.stem(it) }
        val histogram = Histogram.from(words)
        histogram.forEach { (word, count) ->
            println("$word\t$count")
        }

    }

    private fun stemmer(language: Language): SnowballStemmer {
        return when (language) {
            Language.DE -> GermanStemmer()
        }
    }

    private fun SnowballStemmer.stem(word: String): String {
        current = word
        stem()
        return current
    }
}
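As committed, buildTfIdfDict only processes the first 100 tokens (take(100)), stems them with the Snowball GermanStemmer, and prints the resulting term frequencies; the inverse-document-frequency part is not implemented yet. A usage sketch, with a made-up corpus path:

import de.itkl.tfidf.Language
import de.itkl.tfidf.TfIdf
import kotlinx.coroutines.runBlocking
import java.io.File

fun main() = runBlocking {
    // Prints "stemmedWord<TAB>count" for the first 100 tokens of the corpus.
    TfIdf().buildTfIdfDict(File("corpus/de.txt"), Language.DE)
}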