maybe idf is correct now :D

develop
Timo Bryant 2023-12-21 19:19:19 +01:00
parent 81a30dd2f6
commit 3e5534f184
6 changed files with 84 additions and 10 deletions

View File

@ -2,8 +2,10 @@ package de.itkl.fileprocessing
interface ProgressBarFactory { interface ProgressBarFactory {
fun new(resource: Resource): ProgressBar fun new(resource: Resource): ProgressBar
fun new(name: String, max: Long): ProgressBar
} }
interface ProgressBar : AutoCloseable { interface ProgressBar : AutoCloseable {
fun update(bytesRead: Long) fun update(progressed: Long)
fun step()
} }

View File

@ -40,12 +40,17 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
return this return this
} }
fun add(word: String) { fun add(word: String) {
histo.compute(word) { _, count -> histo.compute(word) { _, count ->
count?.let { it + 1u } ?: 1u count?.let { it + 1u } ?: 1u
} }
} }
/**
 * Stores [count] as the value for [word], replacing any value already recorded for it.
 *
 * @throws IllegalArgumentException if [count] is negative; `Int.toUInt()` would otherwise
 * reinterpret the bits and silently store a very large unsigned value.
 */
fun set(word: String, count: Int) {
    require(count >= 0) { "count for '$word' must not be negative, was $count" }
    histo[word] = count.toUInt()
}
val size get() = histo.size val size get() = histo.size
override fun iterator(): Iterator<Pair<String, UInt>> { override fun iterator(): Iterator<Pair<String, UInt>> {
return iterator { return iterator {

View File

@ -14,6 +14,7 @@ import org.koin.core.component.inject
import java.io.File import java.io.File
import java.nio.file.Path import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension import kotlin.io.path.nameWithoutExtension
import kotlin.math.max
private val Log = KotlinLogging.logger { } private val Log = KotlinLogging.logger { }
@ -25,14 +26,16 @@ class DocumentFrequency : FileProcessor, KoinComponent {
override suspend fun process(resource: Resource): File = coroutineScope { override suspend fun process(resource: Resource): File = coroutineScope {
Log.info { "Would produce: ${willProduce(resource.path)}" } Log.info { "Would produce: ${willProduce(resource.path)}" }
val resultFile = willProduce(resource.path).toFile() val resultFile = willProduce(resource.path).toFile()
val histogram = TextFile(resource.read()) val (numDocs, histogram) = TextFile(resource.read())
.splitByEmptyLines() .splitByEmptyLines()
.parallelUnordered(this, 16) { doc -> .withIndex()
.parallelUnordered(this, 16) { (index, doc) ->
val result = collectWordsOfDocument(doc) val result = collectWordsOfDocument(doc)
result index to result
} }
.reduce { acc, other -> acc.join(other)} .reduce { (index, acc), (otherIndex, other) -> max(index, otherIndex) to acc.join(other)}
Log.info { "Writing CSV $resultFile" } Log.info { "Writing CSV $resultFile" }
histogram.set("\$numDocs", numDocs)
HistogramCsvStorage().save(histogram, resultFile) HistogramCsvStorage().save(histogram, resultFile)
resultFile resultFile
} }

View File

@ -1,4 +1,50 @@
package de.itkl.tfidf package de.itkl.tfidf
class InverseDocumentFrequency { import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.fileprocessing.Resource
import de.itkl.textprocessing.HistogramCsvStorage
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
import kotlin.math.ln
import kotlin.math.log
import kotlin.math.log10
import kotlin.math.log2
private val Log = KotlinLogging.logger { }
class InverseDocumentFrequency : FileProcessor, KoinComponent {
/**
 * Derives the output path for [path]: a sibling file named
 * `<name-without-extension>-inverse-document-frequency.csv`.
 *
 * Uses [Path.resolveSibling] so that a path without a parent component (for example a bare
 * relative file name) yields the bare output name instead of failing on a null parent.
 */
override fun willProduce(path: Path): Path =
    path.resolveSibling(path.nameWithoutExtension + "-inverse-document-frequency.csv")
/**
 * Reads the histogram CSV behind [resource] and writes a `word,idf` CSV to the file named by
 * [willProduce].
 *
 * The document count is taken from the reserved `$numDocs` histogram entry. That entry is
 * metadata rather than a word, so no row is written for it.
 *
 * @return the CSV file that was written.
 * @throws IllegalStateException if the histogram contains no `$numDocs` entry.
 */
override suspend fun process(resource: Resource): File {
    val numDocsKey = "\$numDocs"
    val histogram = HistogramCsvStorage().read(resource.toFile())
    val numDocs = histogram
        .firstOrNull { (word, _) -> word == numDocsKey }
        ?.second
        ?.toInt()
        ?: error("Histogram ${resource.path} has no $numDocsKey entry")
    val progressBarFactory: ProgressBarFactory by inject()
    val resultFile = willProduce(resource.path).toFile()
    progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
        csvWriter().openAsync(resultFile, append = false) {
            writeRow("word", "idf")
            histogram.forEach { (word, count) ->
                if (word != numDocsKey) {
                    writeRow(word, idf(numDocs, count))
                }
                // Advance once per histogram entry (including the skipped metadata entry)
                // so the bar reaches the total it was created with.
                progress.step()
            }
        }
    }
    // Return the produced file, matching what DocumentFrequency.process returns.
    return resultFile
}
/** Returns the base-10 logarithm of [numDocs] divided by [count]. */
private fun idf(numDocs: Int, count: UInt): Double =
    log10(numDocs.toDouble() / count.toDouble())
} }

View File

@ -19,6 +19,18 @@ class TerminalProgressBarFactory : ProgressBarFactory {
} }
return TerminalProgressBar(animation, resource.length()) return TerminalProgressBar(animation, resource.length())
} }
/**
 * Creates a progress bar labelled [name] whose total is [max].
 *
 * The animation shows, in order: the label, a percentage, the bar itself, the completed
 * amount and the estimated time remaining.
 */
override fun new(name: String, max: Long): ProgressBar =
    TerminalProgressBar(
        terminal.progressAnimation {
            text(name)
            percentage()
            progressBar()
            completed()
            timeRemaining()
        },
        max,
    )
} }
class TerminalProgressBar( class TerminalProgressBar(
@ -28,8 +40,12 @@ class TerminalProgressBar(
animation.start() animation.start()
animation.updateTotal(total) animation.updateTotal(total)
} }
override fun update(bytesRead: Long) { override fun update(progressed: Long) {
animation.update(bytesRead) animation.update(progressed)
}
override fun step() {
animation.advance()
} }
override fun close() { override fun close() {

View File

@ -1,11 +1,13 @@
package de.itkl.tfidf package de.itkl.tfidf
import de.itkl.fileprocessing.FileProcessingPipeline import de.itkl.fileprocessing.FileProcessingPipeline
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory import de.itkl.fileprocessing.ProgressBarFactory
import org.koin.core.component.KoinComponent import org.koin.core.component.KoinComponent
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) { class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
override val fileProcessor = listOf( override val fileProcessor = listOf<FileProcessor>(
DocumentFrequency() DocumentFrequency(),
InverseDocumentFrequency()
) )
} }