maybe idf is correct now :D
parent
81a30dd2f6
commit
3e5534f184
|
|
@ -2,8 +2,10 @@ package de.itkl.fileprocessing
|
|||
|
||||
/**
 * Creates [ProgressBar] instances.
 */
interface ProgressBarFactory {
    /** Creates a progress bar for the given [resource]. */
    fun new(resource: Resource): ProgressBar

    /** Creates a progress bar labelled [name] whose total is [max]. */
    fun new(name: String, max: Long): ProgressBar
}
|
||||
|
||||
/**
 * A closeable progress indicator.
 *
 * Progress can be reported either as an absolute value via [update] or
 * incrementally via [step].
 */
interface ProgressBar : AutoCloseable {
    /** Reports the absolute amount of work [progressed] so far. */
    fun update(progressed: Long)

    /** Reports that one more unit of work has been completed. */
    fun step()
}
|
||||
|
|
@ -40,12 +40,17 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
|
|||
return this
|
||||
}
|
||||
|
||||
|
||||
/** Increments the count stored for [word], starting at 1 when the word is not present yet. */
fun add(word: String) {
    histo.compute(word) { _, previous -> (previous ?: 0u) + 1u }
}
|
||||
|
||||
/**
 * Stores [count] as the count for [word], replacing any previous value.
 *
 * @throws IllegalArgumentException if [count] is negative; a negative value
 *   would otherwise wrap around to a very large unsigned count.
 */
fun set(word: String, count: Int) {
    require(count >= 0) { "count must be non-negative, was $count" }
    histo[word] = count.toUInt()
}
|
||||
|
||||
/** Number of entries currently held in the backing map. */
val size: Int
    get() = histo.size
|
||||
override fun iterator(): Iterator<Pair<String, UInt>> {
|
||||
return iterator {
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ import org.koin.core.component.inject
|
|||
import java.io.File
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.nameWithoutExtension
|
||||
import kotlin.math.max
|
||||
|
||||
private val Log = KotlinLogging.logger { }
|
||||
|
||||
|
|
@ -25,14 +26,16 @@ class DocumentFrequency : FileProcessor, KoinComponent {
|
|||
/**
 * Builds the document-frequency histogram for [resource] and writes it as CSV.
 *
 * The text is split on empty lines into documents, the words of each document
 * are collected concurrently, and the per-document histograms are joined into
 * one. The number of documents is stored in the histogram under the reserved
 * key `$numDocs` before saving.
 *
 * @return the CSV file that was written (see [willProduce])
 */
override suspend fun process(resource: Resource): File = coroutineScope {
    Log.info { "Would produce: ${willProduce(resource.path)}" }
    val resultFile = willProduce(resource.path).toFile()
    val (lastIndex, histogram) = TextFile(resource.read())
        .splitByEmptyLines()
        .withIndex()
        .parallelUnordered(this, 16) { (index, doc) ->
            index to collectWordsOfDocument(doc)
        }
        // Results arrive unordered, so track the highest index seen while joining.
        .reduce { (index, acc), (otherIndex, other) -> max(index, otherIndex) to acc.join(other) }
    Log.info { "Writing CSV $resultFile" }
    // withIndex() is zero-based: the document count is the highest index plus one.
    histogram.set("\$numDocs", lastIndex + 1)
    HistogramCsvStorage().save(histogram, resultFile)
    resultFile
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,50 @@
|
|||
package de.itkl.tfidf
|
||||
|
||||
class InverseDocumentFrequency {
|
||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
||||
import de.itkl.fileprocessing.FileProcessor
|
||||
import de.itkl.fileprocessing.ProgressBarFactory
|
||||
import de.itkl.fileprocessing.Resource
|
||||
import de.itkl.textprocessing.HistogramCsvStorage
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.inject
|
||||
import java.io.File
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.nameWithoutExtension
|
||||
import kotlin.math.ln
|
||||
import kotlin.math.log
|
||||
import kotlin.math.log10
|
||||
import kotlin.math.log2
|
||||
|
||||
private val Log = KotlinLogging.logger { }
|
||||
|
||||
/**
 * Reads a document-frequency histogram CSV and writes a CSV with the inverse
 * document frequency (`word`, `idf`) of every word.
 *
 * The histogram must contain the number of documents under the reserved key
 * `$numDocs`.
 */
class InverseDocumentFrequency : FileProcessor, KoinComponent {
    private val progressBarFactory: ProgressBarFactory by inject()

    /** Resolves `<name>-inverse-document-frequency.csv` next to [path]. */
    override fun willProduce(path: Path): Path {
        return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
    }

    /**
     * Computes the idf for each histogram entry of [resource] and writes the result CSV.
     *
     * @return the CSV file that was written (see [willProduce])
     * @throws IllegalStateException if the histogram has no `$numDocs` entry
     */
    override suspend fun process(resource: Resource): File {
        val histogram = HistogramCsvStorage().read(resource.toFile())
        val numDocsEntry = histogram.find { (word, _) -> word == NUM_DOCS_KEY }
            ?: error("Histogram of ${resource.path} has no $NUM_DOCS_KEY entry")
        val numDocs = numDocsEntry.second.toInt()
        val resultFile = willProduce(resource.path).toFile()

        progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
            csvWriter().openAsync(resultFile, append = false) {
                writeRow("word", "idf")
                histogram.forEach { (word, count) ->
                    // The document-count entry is metadata, not a word of the corpus.
                    if (word != NUM_DOCS_KEY) {
                        writeRow(word, idf(numDocs, count))
                    }
                    // Step for every entry so the bar reaches the total passed above.
                    progress.step()
                }
            }
        }
        return resultFile
    }

    /** Returns `log10(numDocs / count)`. */
    private fun idf(numDocs: Int, count: UInt): Double = log10(numDocs / count.toDouble())

    private companion object {
        /** Reserved histogram key under which the number of documents is stored. */
        const val NUM_DOCS_KEY = "\$numDocs"
    }
}
|
||||
|
|
@ -19,6 +19,18 @@ class TerminalProgressBarFactory : ProgressBarFactory {
|
|||
}
|
||||
return TerminalProgressBar(animation, resource.length())
|
||||
}
|
||||
|
||||
/**
 * Builds a terminal progress animation labelled [name] (text, percentage, bar,
 * completed count and remaining time) and wraps it in a [TerminalProgressBar]
 * with [max] as its total.
 */
override fun new(name: String, max: Long): ProgressBar =
    TerminalProgressBar(
        terminal.progressAnimation {
            text(name)
            percentage()
            progressBar()
            completed()
            timeRemaining()
        },
        max
    )
|
||||
|
||||
}
|
||||
|
||||
class TerminalProgressBar(
|
||||
|
|
@ -28,8 +40,12 @@ class TerminalProgressBar(
|
|||
animation.start()
|
||||
animation.updateTotal(total)
|
||||
}
|
||||
/** Forwards the absolute progress value [progressed] to the underlying animation. */
override fun update(progressed: Long) {
    animation.update(progressed)
}
|
||||
|
||||
/** Advances the underlying animation using its default advance amount. */
override fun step() = animation.advance()
|
||||
|
||||
override fun close() {
|
||||
|
|
|
|||
|
|
@ -1,11 +1,13 @@
|
|||
package de.itkl.tfidf
|
||||
|
||||
import de.itkl.fileprocessing.FileProcessingPipeline
|
||||
import de.itkl.fileprocessing.FileProcessor
|
||||
import de.itkl.fileprocessing.ProgressBarFactory
|
||||
import org.koin.core.component.KoinComponent
|
||||
|
||||
/**
 * File-processing pipeline whose processors are, in list order,
 * [DocumentFrequency] and [InverseDocumentFrequency].
 */
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
    override val fileProcessor = listOf<FileProcessor>(
        DocumentFrequency(),
        InverseDocumentFrequency()
    )
}
|
||||
Loading…
Reference in New Issue