maybe idf is correct now :D

develop
Timo Bryant 2023-12-21 19:19:19 +01:00
parent 81a30dd2f6
commit 3e5534f184
6 changed files with 84 additions and 10 deletions

View File

@ -2,8 +2,10 @@ package de.itkl.fileprocessing
/**
 * Factory for [ProgressBar] instances.
 */
interface ProgressBarFactory {
/** Creates a progress bar for the given [resource]. */
fun new(resource: Resource): ProgressBar
/**
 * Creates a progress bar from a [name] and a [max] value.
 *
 * NOTE(review): the terminal implementation in this change shows [name] as the label
 * and passes [max] on as the bar's total — confirm other implementations agree.
 */
fun new(name: String, max: Long): ProgressBar
}
/**
 * A progress indicator that is released via [close].
 */
interface ProgressBar : AutoCloseable {
    /**
     * Reports progress to the bar.
     *
     * NOTE(review): presumably [progressed] is the absolute amount completed so far
     * rather than a delta — confirm against the implementation.
     */
    fun update(progressed: Long)

    /** Advances the bar by a single step. */
    fun step()
}

View File

@ -40,12 +40,17 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
return this
}
/** Increments the stored count for [word], starting at 1 when the word has not been seen yet. */
fun add(word: String) {
    histo.compute(word) { _, previous -> (previous ?: 0u) + 1u }
}
/**
 * Overwrites the stored count for [word] with [count].
 *
 * @throws IllegalArgumentException if [count] is negative; `toUInt()` would otherwise
 * wrap it around to a very large unsigned value.
 */
fun set(word: String, count: Int) {
    require(count >= 0) { "count must not be negative: $count" }
    histo[word] = count.toUInt()
}
/** Number of entries currently held in the backing `histo` map. */
val size get() = histo.size
override fun iterator(): Iterator<Pair<String, UInt>> {
return iterator {

View File

@ -14,6 +14,7 @@ import org.koin.core.component.inject
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
import kotlin.math.max
private val Log = KotlinLogging.logger { }
@ -25,14 +26,16 @@ class DocumentFrequency : FileProcessor, KoinComponent {
/**
 * Builds one combined histogram for all documents in [resource] and saves it as CSV.
 *
 * The text is split with `splitByEmptyLines()`, each part is passed to
 * `collectWordsOfDocument`, and the per-document results are merged with `join`.
 * The number of documents is stored in the histogram under the `$numDocs` key
 * before it is written.
 *
 * NOTE(review): the `16` passed to `parallelUnordered` is presumably the degree of
 * parallelism — confirm.
 *
 * @return the CSV file that was written (the target given by `willProduce`)
 */
override suspend fun process(resource: Resource): File = coroutineScope {
    Log.info { "Would produce: ${willProduce(resource.path)}" }
    val resultFile = willProduce(resource.path).toFile()
    val (lastIndex, histogram) = TextFile(resource.read())
        .splitByEmptyLines()
        .withIndex()
        .parallelUnordered(this, 16) { (index, doc) ->
            index to collectWordsOfDocument(doc)
        }
        .reduce { (index, acc), (otherIndex, other) -> max(index, otherIndex) to acc.join(other) }
    Log.info { "Writing CSV $resultFile" }
    // withIndex() counts from zero, so the highest index seen is one less than the number of documents.
    histogram.set("\$numDocs", lastIndex + 1)
    HistogramCsvStorage().save(histogram, resultFile)
    resultFile
}

View File

@ -1,4 +1,50 @@
package de.itkl.tfidf
class InverseDocumentFrequency {
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.fileprocessing.Resource
import de.itkl.textprocessing.HistogramCsvStorage
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
import kotlin.math.ln
import kotlin.math.log
import kotlin.math.log10
import kotlin.math.log2
private val Log = KotlinLogging.logger { }
/**
 * Reads a document-frequency histogram CSV and writes a CSV with the inverse
 * document frequency of every word.
 */
class InverseDocumentFrequency : FileProcessor, KoinComponent {
    private val progressBarFactory: ProgressBarFactory by inject()

    /** Target path: next to [path], named `<name without extension>-inverse-document-frequency.csv`. */
    override fun willProduce(path: Path): Path {
        return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
    }

    /**
     * Computes the idf for each histogram entry of [resource] and writes `word,idf` rows.
     *
     * The histogram must contain a `$numDocs` entry holding the number of documents;
     * that entry is bookkeeping only and is not written to the output.
     *
     * @return the CSV file that was written (the target given by [willProduce])
     * @throws IllegalStateException if the histogram has no `$numDocs` entry
     */
    override suspend fun process(resource: Resource): File {
        val histogram = HistogramCsvStorage().read(resource.toFile())
        val numDocs = histogram
            .firstOrNull { (word, _) -> word == NUM_DOCS_KEY }
            ?.second
            ?.toInt()
            ?: error("Histogram ${resource.path} has no $NUM_DOCS_KEY entry")
        val resultFile = willProduce(resource.path).toFile()
        progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
            csvWriter().openAsync(resultFile, append = false) {
                writeRow("word", "idf")
                histogram.forEach { (word, count) ->
                    // The document-count marker is not a word; skip it in the output.
                    if (word != NUM_DOCS_KEY) {
                        writeRow(word, idf(numDocs, count))
                    }
                    // Step for every entry so the bar reaches the total passed above.
                    progress.step()
                }
            }
        }
        return resultFile
    }

    /** Returns `log10(numDocs / count)`. */
    private fun idf(numDocs: Int, count: UInt): Double {
        return log10(numDocs / count.toDouble())
    }

    private companion object {
        /** Histogram key under which the number of documents is stored. */
        const val NUM_DOCS_KEY = "\$numDocs"
    }
}

View File

@ -19,6 +19,18 @@ class TerminalProgressBarFactory : ProgressBarFactory {
}
return TerminalProgressBar(animation, resource.length())
}
/**
 * Creates a terminal progress bar labelled with [name] whose total is [max].
 *
 * The animation shows, in order: the label, a percentage, the bar itself,
 * the completed amount and the remaining time.
 */
override fun new(name: String, max: Long): ProgressBar =
    terminal.progressAnimation {
        text(name)
        percentage()
        progressBar()
        completed()
        timeRemaining()
    }.let { layout -> TerminalProgressBar(layout, max) }
}
class TerminalProgressBar(
@ -28,8 +40,12 @@ class TerminalProgressBar(
animation.start()
animation.updateTotal(total)
}
/** Forwards [progressed] to the underlying animation's `update`. */
override fun update(progressed: Long) {
    animation.update(progressed)
}
/** Advances the underlying animation by calling its `advance()` with no arguments. */
override fun step() {
animation.advance()
}
override fun close() {

View File

@ -1,11 +1,13 @@
package de.itkl.tfidf
import de.itkl.fileprocessing.FileProcessingPipeline
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory
import org.koin.core.component.KoinComponent
/**
 * File-processing pipeline made up of [DocumentFrequency] and [InverseDocumentFrequency].
 *
 * NOTE(review): the processors are listed in this order; presumably
 * [FileProcessingPipeline] runs them in sequence — confirm.
 */
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
    override val fileProcessor = listOf<FileProcessor>(
        DocumentFrequency(),
        InverseDocumentFrequency()
    )
}