rewriting IDF stuff

develop
Timo Bryant 2023-12-17 17:46:51 +01:00
parent ca51b50306
commit d995b26459
18 changed files with 314 additions and 39 deletions

View File

@ -6,10 +6,12 @@ import com.github.ajalt.clikt.parameters.options.option
import com.github.ajalt.clikt.parameters.options.required import com.github.ajalt.clikt.parameters.options.required
import com.github.ajalt.clikt.parameters.types.enum import com.github.ajalt.clikt.parameters.types.enum
import com.github.ajalt.clikt.parameters.types.file import com.github.ajalt.clikt.parameters.types.file
import de.itkl.textprocessing.TextFile
import de.itkl.tfidf.Language import de.itkl.tfidf.Language
import de.itkl.tfidf.TfIdf import de.itkl.tfidf.TfIdf
import de.itkl.tfidf.TfIdfPipeline
import kotlinx.coroutines.flow.take
import kotlinx.coroutines.runBlocking import kotlinx.coroutines.runBlocking
import java.io.File
class ComputeTf : CliktCommand() { class ComputeTf : CliktCommand() {
private val corpus by option(help = "corpus") private val corpus by option(help = "corpus")
@ -20,13 +22,17 @@ class ComputeTf : CliktCommand() {
.required() .required()
override fun run() = runBlocking { override fun run() = runBlocking {
val tfIdf = TfIdf() TfIdfPipeline(language = Language.DE)
val histogram = tfIdf.computeTf( .input(corpus)
corpus, // TextFile(corpus).splitByEmptyLines()
language // .take(10)
) // .collect { println(it) }
tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile()) // val tfIdf = TfIdf()
// val histogram = tfIdf.computeTf(
// corpus,
// language
// )
// val tf = tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile())
} }
} }

View File

@ -0,0 +1,7 @@
// Build script for the fileprocessing library module.
plugins {
// Shared Kotlin library conventions defined by the build logic plugin.
id("docthor.kotlin-library-conventions")
}
dependencies {
// Coroutines are required by FileProcessingPipeline's suspend API.
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
}

View File

@ -0,0 +1,10 @@
package de.itkl.fileprocessing
import java.io.File
import java.io.InputStream
import java.nio.file.Path
/**
 * A single step in a file-processing chain: consumes one input resource and
 * materializes one output file on disk.
 */
interface FileProcessor {
/** Returns the path this processor would write for the given input [path], without doing any work. */
fun willProduce(path: Path): Path
/** Processes [resource] and returns the produced file — presumably the path reported by [willProduce]; confirm with implementors. */
suspend fun process(resource: Resource): File
}

View File

@ -0,0 +1,25 @@
package de.itkl.fileprocessing
import io.github.oshai.kotlinlogging.KotlinLogging
import java.io.File
import kotlin.io.path.exists
private val Log = KotlinLogging.logger { }
/**
 * Runs a file through an ordered chain of [FileProcessor]s.
 *
 * Each step is skipped when its declared output file already exists, which
 * makes the pipeline resumable: re-running it only performs missing steps.
 * The output path of each step becomes the input of the next.
 */
abstract class FileProcessingPipeline {

    /** The ordered processors to apply; each consumes the previous step's output. */
    protected abstract val fileProcessor: List<FileProcessor>

    /**
     * Feeds [file] through every processor in order.
     *
     * NOTE(review): the [File] returned by [FileProcessor.process] is ignored;
     * the pipeline assumes each processor writes exactly the path reported by
     * [FileProcessor.willProduce] — confirm with implementors.
     */
    suspend fun input(file: File) {
        var currentFile = file
        fileProcessor.forEach { processor ->
            val target = processor.willProduce(currentFile.toPath())
            if (target.exists()) {
                Log.info { "$target exists. Skipping" }
            } else {
                // Fix: corrected log-message grammar ("does not exists").
                Log.info { "$target does not exist. Creating" }
                processor.process(FileResource(currentFile))
            }
            currentFile = target.toFile()
        }
    }
}

View File

@ -0,0 +1,39 @@
package de.itkl.fileprocessing
import java.io.InputStream
/**
* Represents an input stream that tracks the progress of reading from an underlying input stream.
*
* @property inputStream The underlying input stream to read from.
* @property updateOp The operation to be executed when the number of bytes read changes.
* @property bytesRead The number of bytes read from the input stream.
*/
/**
 * An [InputStream] decorator that counts the bytes read from [inputStream]
 * and reports every change of the running total through [updateOp].
 *
 * @property inputStream The underlying input stream to read from.
 * @property updateOp Invoked with the new total each time [bytesRead] changes.
 */
class ProgressInputStream(
    private val inputStream: InputStream,
    private val updateOp: (Long) -> Unit) : InputStream() {

    /** Total number of bytes read so far; every update notifies [updateOp]. */
    @Volatile
    var bytesRead: Long = 0
        private set(value) {
            field = value
            updateOp(value)
        }

    override fun read(): Int {
        val byte = inputStream.read()
        if (byte != -1) {
            bytesRead++
        }
        return byte
    }

    override fun read(b: ByteArray, off: Int, len: Int): Int {
        val bytesRead = inputStream.read(b, off, len)
        if (bytesRead != -1) {
            this.bytesRead += bytesRead
        }
        return bytesRead
    }

    override fun read(b: ByteArray): Int {
        return this.read(b, 0, b.size)
    }

    // Fix: InputStream.close() is a no-op by default, so the wrapped stream
    // was never closed through this decorator — delegate to avoid the leak.
    override fun close() {
        inputStream.close()
    }

    // Fix: the default implementation returns 0; delegate for correctness.
    override fun available(): Int = inputStream.available()
}

View File

@ -0,0 +1,34 @@
package de.itkl.fileprocessing
import java.io.File
import java.io.InputStream
import java.nio.file.Files
import java.nio.file.Path
/**
 * A readable blob of bytes identified by a filesystem [path].
 */
interface Resource {
/** Location of the resource on the filesystem. */
val path: Path
/** Size of the resource in bytes. */
val size: Long
/** Convenience view of [path] as a [File]. */
fun toFile(): File = path.toFile()
/** Opens an [InputStream] over the resource's content. */
fun read(): InputStream
}
/**
 * A [Resource] decorator whose [read] stream reports read progress.
 *
 * @property resource The underlying resource; all members except [read] are delegated to it.
 * @property progressOpSupplier Supplies a fresh progress callback for each opened stream.
 */
class ProgressResource(
    private val resource: Resource,
    private val progressOpSupplier: () -> (Long) -> Unit
) : Resource by resource {
    override fun read(): InputStream {
        // Fix: this previously called read() on itself, recursing infinitely
        // (StackOverflowError on first use). It must open the delegate's stream.
        return ProgressInputStream(
            resource.read(),
            progressOpSupplier()
        )
    }
}
/** A [Resource] backed by a regular file on the local filesystem. */
class FileResource(override val path: Path) : Resource {

    /** Convenience constructor for callers holding a [File]. */
    constructor(file: File) : this(file.toPath())

    // Computed lazily on first access and cached afterwards.
    override val size: Long by lazy { path.toFile().length() }

    override fun read(): InputStream = Files.newInputStream(path)
}

View File

@ -0,0 +1,25 @@
package de.itkl.processing
import kotlinx.coroutines.async
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.channels.consumeEach
import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.flow.*
import kotlinx.coroutines.launch
import kotlinx.coroutines.sync.Semaphore
import kotlinx.coroutines.sync.withPermit
import kotlin.coroutines.coroutineContext
/**
 * Maps the items of a [Flow] through [mapperFn] with at most
 * [concurrencyLimit] items in flight at once, preserving emission order.
 */
class ParallelFlowProcessor<T, U>(
    private val mapperFn: (T) -> U,
    private val concurrencyLimit: Int) {

    /**
     * Returns a flow of mapped results in input order.
     *
     * Fix: the previous version built the flow inside a `coroutineScope` and
     * returned it — the scope had already completed by the time the flow was
     * collected, so the `async` workers attached to a finished job; in
     * addition the unbuffered map/await pair made the processing effectively
     * sequential. `channelFlow` ties the workers to the collector's scope,
     * and its internal buffer lets several jobs run concurrently while the
     * semaphore caps them at [concurrencyLimit].
     */
    suspend fun process(flow: Flow<T>): Flow<U> {
        val gate = Semaphore(concurrencyLimit)
        return channelFlow {
            flow.collect { item ->
                send(async { gate.withPermit { mapperFn(item) } })
            }
        }.map { deferred -> deferred.await() }
    }
}

View File

@ -0,0 +1,33 @@
package de.itkl.textprocessing
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.toSet
/**
 * An unordered collection of distinct words.
 */
class BagOfWords(private val data: MutableSet<String> = mutableSetOf()) : Iterable<String> {

    /** Adds a single word; duplicates are ignored. */
    fun add(word: String) {
        data.add(word)
    }

    /** Returns a new bag with the union of this bag and [bagOfWords]; neither input is modified. */
    fun join(bagOfWords: BagOfWords): BagOfWords =
        BagOfWords((data + bagOfWords.data).toMutableSet())

    override fun iterator(): Iterator<String> = data.iterator()

    override fun toString(): String = data.joinToString(",")

    companion object {
        /** Collects every word emitted by [flow] into a new bag. */
        suspend fun from(flow: Flow<String>): BagOfWords =
            BagOfWords(flow.toSet().toMutableSet())
    }
}

View File

@ -0,0 +1,4 @@
package de.itkl.textprocessing
// Placeholder for document-frequency computation; no implementation yet.
class DocumentFrequency {
}

View File

@ -1,6 +1,6 @@
package de.itkl.textprocessing package de.itkl.textprocessing
import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.*
class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : Iterable<Pair<String, UInt>>{ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : Iterable<Pair<String, UInt>>{
@ -11,6 +11,14 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
} }
} }
suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram {
val result = Histogram()
flow.collect { bagOfWords ->
bagOfWords.forEach(result::add)
}
return result
}
fun from(sequence: Sequence<Map<String, String>>): Histogram { fun from(sequence: Sequence<Map<String, String>>): Histogram {
val histo = sequence.associate { map -> map["word"]!! to map["count"]!!.toUInt() } val histo = sequence.associate { map -> map["word"]!! to map["count"]!!.toUInt() }
.toMutableMap() .toMutableMap()

View File

@ -11,6 +11,23 @@ import java.io.InputStreamReader
class TextFile(val file: File) { class TextFile(val file: File) {
/**
 * Splits the file into blocks of consecutive non-empty lines, using empty
 * lines as separators, and emits each block as a list of strings.
 *
 * Fixes over the previous version:
 *  - the reader and the accumulator now live inside the flow builder, so the
 *    returned cold flow can safely be collected more than once;
 *  - a trailing block is no longer dropped when the file does not end with
 *    an empty line.
 *
 * @param progressOp invoked with the running byte count as the file is read.
 */
fun splitByEmptyLines(progressOp: (read: Long) -> Unit = {}): Flow<List<String>> {
    return flow {
        val reader = InputStreamReader(ProgressInputStream(file.inputStream(), progressOp))
        var block = mutableListOf<String>()
        reader.useLines { lines ->
            lines.forEach { line ->
                if (line.isEmpty()) {
                    emit(block)
                    block = mutableListOf()
                } else {
                    block.add(line)
                }
            }
        }
        // Emit the final block if the file did not end with an empty line.
        if (block.isNotEmpty()) {
            emit(block)
        }
    }
}
fun words(progressOp: (read: Long) -> Unit = {}): Flow<String> { fun words(progressOp: (read: Long) -> Unit = {}): Flow<String> {
val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
val tokenizer = StandardTokenizer(factory) val tokenizer = StandardTokenizer(factory)

View File

@ -0,0 +1,44 @@
package de.itkl.textprocessing
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.onCompletion
import org.apache.lucene.analysis.standard.StandardTokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.AttributeFactory
import java.io.StringReader
/**
 * Wraps a reusable Lucene [StandardTokenizer] and exposes tokenization as a
 * [Flow] of token strings.
 *
 * Not thread-safe: the underlying tokenizer is shared mutable state, and a
 * returned flow must be fully collected before [tokenize] is called again.
 */
class Tokenizer : AutoCloseable {

    // Lazily create the Lucene tokenizer once; it is reused across tokenize() calls.
    private val tokenizer by lazy {
        val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
        StandardTokenizer(factory)
    }

    /**
     * Tokenizes [input] and returns a cold flow of its tokens.
     *
     * Fix: the loop previously used `getOrElse { true }`, which turned any
     * exception from `incrementToken()` into an infinite loop re-emitting the
     * last token. A failure now terminates the stream instead.
     */
    fun tokenize(input: String): Flow<String> {
        val reader = StringReader(input)
        tokenizer.setReader(reader)
        tokenizer.reset()
        val attr = tokenizer.addAttribute(CharTermAttribute::class.java)
        return flow {
            while (kotlin.runCatching { tokenizer.incrementToken() }.getOrElse { false }) {
                emit(attr.toString())
            }
        }.onCompletion {
            // end() finishes the token stream, close() releases the current
            // reader; per Lucene's TokenStream contract the tokenizer can then
            // be reused via setReader() on the next tokenize() call.
            tokenizer.end()
            tokenizer.close()
        }
    }

    override fun close() {
        tokenizer.close()
    }
}

View File

@ -4,6 +4,7 @@ plugins {
dependencies { dependencies {
api(project(":libraries:textprocessing")) api(project(":libraries:textprocessing"))
api(project(":libraries:fileprocessing"))
implementation("com.github.ajalt.mordant:mordant:2.2.0") implementation("com.github.ajalt.mordant:mordant:2.2.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2") implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
} }

View File

@ -0,0 +1,40 @@
package de.itkl.tfidf
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.Resource
import de.itkl.textprocessing.*
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.flow.map
import kotlinx.coroutines.flow.reduce
import kotlinx.coroutines.flow.take
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
private val Log = KotlinLogging.logger { }
/**
 * A [FileProcessor] that builds the document-frequency histogram needed for
 * IDF: each empty-line-separated block of the input file is treated as one
 * document, reduced to its set of distinct tokens, and the per-document sets
 * are accumulated into a histogram stored as `<name>-idf.csv`.
 */
class Idf : FileProcessor {

    /** The output lives next to the input, with an `-idf.csv` suffix. */
    override fun willProduce(path: Path): Path {
        return path.parent.resolve(path.nameWithoutExtension + "-idf.csv")
    }

    override suspend fun process(resource: Resource): File {
        Log.info { "Would produce: ${willProduce(resource.path)}" }
        val resultFile = willProduce(resource.path).toFile()
        val textFile = TextFile(resource.toFile())
        // Fix: removed the leftover `.take(10)` debug cap, which silently
        // restricted the IDF statistics to the first ten documents.
        val bagOfWords = textFile.splitByEmptyLines { }
            .map { document ->
                // One tokenizer per document, released deterministically.
                Tokenizer().use { tokenizer ->
                    // fold instead of reduce: an empty document yields an
                    // empty bag instead of throwing UnsupportedOperationException.
                    document.fold(BagOfWords()) { acc, line ->
                        acc.join(BagOfWords.from(tokenizer.tokenize(line)))
                    }
                }
            }
        val histogram = Histogram.fromBagOfWords(bagOfWords)
        HistogramCsvStorage().save(histogram, resultFile)
        return resultFile
    }
}

View File

@ -1,31 +1,9 @@
package de.itkl.tfidf package de.itkl.tfidf
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.textprocessing.Histogram import de.itkl.textprocessing.Histogram
import io.github.oshai.kotlinlogging.KotlinLogging import io.github.oshai.kotlinlogging.KotlinLogging
import java.io.File import java.io.File
import kotlin.math.max
private val Log = KotlinLogging.logger { } private val Log = KotlinLogging.logger { }
class Tf {
private val data: MutableMap<String, Double> = mutableMapOf()
fun update(histogram: Histogram): Tf {
val max = histogram.maxOf { (_, count) -> count }
.toDouble()
histogram.forEach { (word, count) ->
val tf = count.toDouble() / max
data[word] = tf
}
return this
}
suspend fun saveToCsv(file: File) {
csvWriter {}
.openAsync(file, append = false) {
writeRow("term", "frequency")
data.forEach { (t, u) ->
writeRow(t, u)
}
}
}
}

View File

@ -41,13 +41,6 @@ class TfIdf {
return histogram return histogram
} }
suspend fun normalizeTf(histogram: Histogram, destination: File) {
Log.info { "Write tf to $destination" }
Tf()
.update(histogram)
.saveToCsv(destination)
}
private fun stemmer(language: Language): SnowballStemmer { private fun stemmer(language: Language): SnowballStemmer {
return when(language) { return when(language) {
Language.DE -> GermanStemmer() Language.DE -> GermanStemmer()

View File

@ -0,0 +1,10 @@
package de.itkl.tfidf
import de.itkl.fileprocessing.FileProcessingPipeline
import de.itkl.fileprocessing.FileProcessor
/**
 * The concrete file-processing pipeline for TF-IDF computation.
 *
 * Currently runs only the [Idf] step.
 */
class TfIdfPipeline(private val language: Language) : FileProcessingPipeline() {
// NOTE(review): `language` is not used anywhere in this class yet —
// presumably intended for language-specific tokenization/stemming in later
// steps; confirm before removing.
override val fileProcessor = listOf(
Idf()
)
}

View File

@ -11,4 +11,5 @@ include(
"app", "app",
"libraries:tfidf", "libraries:tfidf",
"libraries:textprocessing", "libraries:textprocessing",
"libraries:fileprocessing",
) )