From 3e5534f1844a4a79f5171a72ead2cb75e7c9d176 Mon Sep 17 00:00:00 2001
From: Timo Bryant
Date: Thu, 21 Dec 2023 19:19:19 +0100
Subject: [PATCH] maybe idf is correct now :D

---
 .../itkl/fileprocessing/ProgressBarFactory.kt |  4 +-
 .../de/itkl/textprocessing/Histogram.kt       |  5 ++
 .../kotlin/de/itkl/tfidf/DocumentFrequency.kt | 11 +++--
 .../de/itkl/tfidf/InverseDocumentFrequency.kt | 48 ++++++++++++++++++-
 .../itkl/tfidf/TerminalProgressBarFactory.kt  | 20 +++++++-
 .../kotlin/de/itkl/tfidf/TfIdfPipeline.kt     |  6 ++-
 6 files changed, 84 insertions(+), 10 deletions(-)

diff --git a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProgressBarFactory.kt b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProgressBarFactory.kt
index af0e176..3da49f9 100644
--- a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProgressBarFactory.kt
+++ b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProgressBarFactory.kt
@@ -2,8 +2,10 @@ package de.itkl.fileprocessing
 
 interface ProgressBarFactory {
     fun new(resource: Resource): ProgressBar
+    fun new(name: String, max: Long): ProgressBar // the terminal implementation shows `name` as label and uses `max` as total
 }
 
 interface ProgressBar : AutoCloseable {
-    fun update(bytesRead: Long)
+    fun update(progressed: Long) // presumably the absolute amount done so far, not a delta - confirm against the animation API
+    fun step() // the terminal implementation maps this to the animation's advance()
 }
\ No newline at end of file
diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt
index 0df3a78..340f225 100644
--- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt
+++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt
@@ -40,12 +40,17 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
         return this
     }
 
+
     fun add(word: String) {
         histo.compute(word) { _, count ->
             count?.let { it + 1u } ?: 1u
         }
     }
 
+    fun set(word: String, count: Int) { // overwrites any existing entry; a negative count wraps around in toUInt()
+        histo[word] = count.toUInt()
+    }
+
     val size get() = histo.size
     override fun iterator(): Iterator<Pair<String, UInt>> {
         return iterator {
diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/DocumentFrequency.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/DocumentFrequency.kt
index 8b4b2dd..263c9cc 100644
--- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/DocumentFrequency.kt
+++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/DocumentFrequency.kt
@@ -14,6 +14,7 @@ import org.koin.core.component.inject
 import java.io.File
 import java.nio.file.Path
 import kotlin.io.path.nameWithoutExtension
+import kotlin.math.max
 
 private val Log = KotlinLogging.logger { }
 
@@ -25,14 +26,16 @@ class DocumentFrequency : FileProcessor, KoinComponent {
     override suspend fun process(resource: Resource): File = coroutineScope {
         Log.info { "Would produce: ${willProduce(resource.path)}" }
         val resultFile = willProduce(resource.path).toFile()
-        val histogram = TextFile(resource.read())
+        val (lastDocIndex, histogram) = TextFile(resource.read())
             .splitByEmptyLines()
-            .parallelUnordered(this, 16) { doc ->
+            .withIndex() // tag every document with its position so the corpus size survives the unordered merge
+            .parallelUnordered(this, 16) { (index, doc) ->
                 val result = collectWordsOfDocument(doc)
-                result
+                index to result
             }
-            .reduce { acc, other -> acc.join(other)}
+            .reduce { (index, acc), (otherIndex, other) -> max(index, otherIndex) to acc.join(other) } // keep the highest index seen
         Log.info { "Writing CSV $resultFile" }
+        histogram.set("\$numDocs", lastDocIndex + 1) // withIndex() is zero-based: document count = highest index + 1
         HistogramCsvStorage().save(histogram, resultFile)
         resultFile
     }
diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/InverseDocumentFrequency.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/InverseDocumentFrequency.kt
index 61e12f4..6b73d94 100644
--- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/InverseDocumentFrequency.kt
+++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/InverseDocumentFrequency.kt
@@ -1,4 +1,50 @@
 package de.itkl.tfidf
 
-class InverseDocumentFrequency {
+import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
+import de.itkl.fileprocessing.FileProcessor
+import de.itkl.fileprocessing.ProgressBarFactory
+import de.itkl.fileprocessing.Resource
+import
de.itkl.textprocessing.HistogramCsvStorage
+import io.github.oshai.kotlinlogging.KotlinLogging
+import org.koin.core.component.KoinComponent
+import org.koin.core.component.inject
+import java.io.File
+import java.nio.file.Path
+import kotlin.io.path.nameWithoutExtension
+import kotlin.math.ln
+import kotlin.math.log
+import kotlin.math.log10
+import kotlin.math.log2
+
+private val Log = KotlinLogging.logger { }
+
+/** Turns a histogram CSV (word counts plus a `$numDocs` entry) into a CSV of inverse document frequencies. */
+class InverseDocumentFrequency : FileProcessor, KoinComponent {
+    override fun willProduce(path: Path): Path =
+        path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
+
+    /**
+     * Reads the histogram in [resource], writes a `word,idf` row for every word except the
+     * `$numDocs` marker (whose absence is an error) and returns the written file.
+     */
+    override suspend fun process(resource: Resource): File {
+        val outputFile = willProduce(resource.path).toFile()
+        val histogram = HistogramCsvStorage().read(resource.toFile())
+        val numDocs = histogram.find { (word, _) -> word == "\$numDocs" }?.second?.toInt()
+            ?: error("histogram of ${resource.path} has no \$numDocs entry")
+        val progressBarFactory: ProgressBarFactory by inject()
+        progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
+            csvWriter().openAsync(outputFile, append = false) {
+                writeRow("word", "idf")
+                histogram.forEach { (word, count) ->
+                    if (word != "\$numDocs") writeRow(word, idf(numDocs, count))
+                    progress.step()
+                }
+            }
+        }
+        return outputFile
+    }
+
+    /** log10(numDocs / count): 0 for a word found in every document, larger the rarer the word. */
+    private fun idf(numDocs: Int, count: UInt): Double = log10(numDocs / count.toDouble())
 }
\ No newline at end of file
diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TerminalProgressBarFactory.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TerminalProgressBarFactory.kt
index e53d6bd..0f63f46 100644
--- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TerminalProgressBarFactory.kt
+++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TerminalProgressBarFactory.kt
@@ -19,6 +19,18 @@ class TerminalProgressBarFactory : ProgressBarFactory {
         }
         return TerminalProgressBar(animation, resource.length())
     }
+
+    override fun new(name: String, max: Long): ProgressBar {
+        val animation =
terminal.progressAnimation {
+            text(name)
+            percentage()
+            progressBar()
+            completed()
+            timeRemaining()
+        }
+        return TerminalProgressBar(animation, max)
+    }
+
 }
 
 class TerminalProgressBar(
@@ -28,8 +40,12 @@ class TerminalProgressBar(
         animation.start()
         animation.updateTotal(total)
     }
-    override fun update(bytesRead: Long) {
-        animation.update(bytesRead)
+    override fun update(progressed: Long) { // hands the value to the animation unchanged
+        animation.update(progressed)
+    }
+
+    override fun step() { // delegates to advance(); presumably moves the bar forward by one - confirm
+        animation.advance()
     }
 
     override fun close() {
diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt
index 41922fc..2d4ff73 100644
--- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt
+++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt
@@ -1,11 +1,13 @@
 package de.itkl.tfidf
 
 import de.itkl.fileprocessing.FileProcessingPipeline
+import de.itkl.fileprocessing.FileProcessor
 import de.itkl.fileprocessing.ProgressBarFactory
 import org.koin.core.component.KoinComponent
 
 class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
-    override val fileProcessor = listOf(
-        DocumentFrequency()
+    override val fileProcessor = listOf<FileProcessor>( // explicit element type: the two processors share more than one supertype
+        DocumentFrequency(),
+        InverseDocumentFrequency()
     )
 }
\ No newline at end of file