maybe idf is correct now :D
parent
81a30dd2f6
commit
3e5534f184
|
|
@ -2,8 +2,10 @@ package de.itkl.fileprocessing
|
||||||
|
|
||||||
/**
 * Factory for [ProgressBar] instances.
 */
interface ProgressBarFactory {
    /** Creates a progress bar for work performed on [resource]. */
    fun new(resource: Resource): ProgressBar

    /**
     * Creates a progress bar labelled [name].
     *
     * NOTE(review): [max] is presumably the total amount of work the bar represents
     * (the terminal implementation passes it on as the animation total) — confirm for other implementations.
     */
    fun new(name: String, max: Long): ProgressBar
}
|
||||||
|
|
||||||
/**
 * Handle on a running progress display; closing it ends the display.
 */
interface ProgressBar : AutoCloseable {
    /**
     * Reports progress.
     *
     * NOTE(review): [progressed] is presumably the absolute amount completed so far,
     * not a delta — confirm against the implementations.
     */
    fun update(progressed: Long)

    /** Advances the progress by a single step. */
    fun step()
}
|
||||||
|
|
@ -40,12 +40,17 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
|
||||||
return this
|
return this
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
 * Counts one more occurrence of [word]: an unseen word starts at 1,
 * a known word has its stored count incremented by 1.
 */
fun add(word: String) {
    histo.compute(word) { _, previous ->
        if (previous == null) 1u else previous + 1u
    }
}
|
||||||
|
|
||||||
|
/**
 * Overwrites the stored count of [word] with [count].
 *
 * @throws IllegalArgumentException if [count] is negative; `Int.toUInt()` would
 * otherwise silently wrap it into a very large unsigned value.
 */
fun set(word: String, count: Int) {
    require(count >= 0) { "count must not be negative, but was $count for '$word'" }
    histo[word] = count.toUInt()
}
|
||||||
|
|
||||||
/** Number of entries currently held in the backing map. */
val size: Int
    get() = histo.size
|
||||||
override fun iterator(): Iterator<Pair<String, UInt>> {
|
override fun iterator(): Iterator<Pair<String, UInt>> {
|
||||||
return iterator {
|
return iterator {
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@ import org.koin.core.component.inject
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
import kotlin.io.path.nameWithoutExtension
|
import kotlin.io.path.nameWithoutExtension
|
||||||
|
import kotlin.math.max
|
||||||
|
|
||||||
private val Log = KotlinLogging.logger { }
|
private val Log = KotlinLogging.logger { }
|
||||||
|
|
||||||
|
|
@ -25,14 +26,16 @@ class DocumentFrequency : FileProcessor, KoinComponent {
|
||||||
/**
 * Builds the document-frequency histogram for [resource] and stores it as CSV.
 *
 * The input is split into documents at empty lines, the words of each document are
 * collected in parallel, and the per-document results are joined into one histogram.
 * The number of documents is stored in the histogram under the key `$numDocs`.
 *
 * @return the CSV file that was written (see [willProduce]).
 */
override suspend fun process(resource: Resource): File = coroutineScope {
    Log.info { "Would produce: ${willProduce(resource.path)}" }
    val resultFile = willProduce(resource.path).toFile()
    val (lastIndex, histogram) = TextFile(resource.read())
        .splitByEmptyLines()
        .withIndex()
        .parallelUnordered(this, 16) { (index, doc) ->
            index to collectWordsOfDocument(doc)
        }
        .reduce { (index, acc), (otherIndex, other) -> max(index, otherIndex) to acc.join(other) }
    // withIndex() counts from zero, so the largest index seen is one less than
    // the number of documents.
    val numDocs = lastIndex + 1
    Log.info { "Writing CSV $resultFile" }
    histogram.set("\$numDocs", numDocs)
    HistogramCsvStorage().save(histogram, resultFile)
    resultFile
}
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,50 @@
|
||||||
package de.itkl.tfidf
|
package de.itkl.tfidf
|
||||||
|
|
||||||
class InverseDocumentFrequency {
|
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
||||||
|
import de.itkl.fileprocessing.FileProcessor
|
||||||
|
import de.itkl.fileprocessing.ProgressBarFactory
|
||||||
|
import de.itkl.fileprocessing.Resource
|
||||||
|
import de.itkl.textprocessing.HistogramCsvStorage
|
||||||
|
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||||
|
import org.koin.core.component.KoinComponent
|
||||||
|
import org.koin.core.component.inject
|
||||||
|
import java.io.File
|
||||||
|
import java.nio.file.Path
|
||||||
|
import kotlin.io.path.nameWithoutExtension
|
||||||
|
import kotlin.math.ln
|
||||||
|
import kotlin.math.log
|
||||||
|
import kotlin.math.log10
|
||||||
|
import kotlin.math.log2
|
||||||
|
|
||||||
|
private val Log = KotlinLogging.logger { }
|
||||||
|
|
||||||
|
/**
 * [FileProcessor] that reads a document-frequency histogram CSV and writes a CSV
 * with the columns `word` and `idf`.
 *
 * The histogram must contain an entry named `$numDocs` holding the number of documents.
 */
class InverseDocumentFrequency : FileProcessor, KoinComponent {
    private val progressBarFactory: ProgressBarFactory by inject()

    /**
     * Returns the output path: a sibling of [path] named
     * `<name-without-extension>-inverse-document-frequency.csv`.
     */
    override fun willProduce(path: Path): Path {
        // resolveSibling also works for a bare file name, where path.parent is null.
        return path.resolveSibling(path.nameWithoutExtension + "-inverse-document-frequency.csv")
    }

    /**
     * Computes the inverse document frequency of every word in the histogram stored in [resource].
     *
     * @return the CSV file that was written (see [willProduce]).
     * @throws IllegalStateException if the histogram has no `$numDocs` entry.
     */
    override suspend fun process(resource: Resource): File {
        val histogram = HistogramCsvStorage().read(resource.toFile())
        val numDocs = histogram
            .find { (word, _) -> word == NUM_DOCS_KEY }
            ?.second
            ?.toInt()
            ?: error("Histogram of ${resource.path} has no '$NUM_DOCS_KEY' entry")
        val resultFile = willProduce(resource.path).toFile()

        progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
            csvWriter().openAsync(resultFile, append = false) {
                writeRow("word", "idf")
                histogram.forEach { (word, count) ->
                    // The document-count entry is bookkeeping, not a word of the corpus.
                    if (word != NUM_DOCS_KEY) {
                        writeRow(word, idf(numDocs, count))
                    }
                    // Step for every entry so the bar reaches the announced total.
                    progress.step()
                }
            }
        }
        return resultFile
    }

    /** Base-10 logarithm of [numDocs] divided by [count]. */
    private fun idf(numDocs: Int, count: UInt): Double = log10(numDocs / count.toDouble())

    private companion object {
        /** Histogram key under which the number of documents is stored. */
        const val NUM_DOCS_KEY = "\$numDocs"
    }
}
|
||||||
|
|
@ -19,6 +19,18 @@ class TerminalProgressBarFactory : ProgressBarFactory {
|
||||||
}
|
}
|
||||||
return TerminalProgressBar(animation, resource.length())
|
return TerminalProgressBar(animation, resource.length())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Creates a terminal progress bar labelled [name] with [max] as its total.
 *
 * The animation shows the label, a percentage, the bar itself, the completed
 * amount and the estimated time remaining.
 */
override fun new(name: String, max: Long): ProgressBar {
    val namedAnimation = terminal.progressAnimation {
        text(name)
        percentage()
        progressBar()
        completed()
        timeRemaining()
    }
    return TerminalProgressBar(namedAnimation, max)
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class TerminalProgressBar(
|
class TerminalProgressBar(
|
||||||
|
|
@ -28,8 +40,12 @@ class TerminalProgressBar(
|
||||||
animation.start()
|
animation.start()
|
||||||
animation.updateTotal(total)
|
animation.updateTotal(total)
|
||||||
}
|
}
|
||||||
/** Forwards [progressed] to the underlying animation's `update`. */
override fun update(progressed: Long) {
    animation.update(progressed)
}
|
||||||
|
|
||||||
|
/** Advances the underlying animation via its `advance()` call. */
override fun step() {
    animation.advance()
}
|
||||||
|
|
||||||
override fun close() {
|
override fun close() {
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,13 @@
|
||||||
package de.itkl.tfidf
|
package de.itkl.tfidf
|
||||||
|
|
||||||
import de.itkl.fileprocessing.FileProcessingPipeline
|
import de.itkl.fileprocessing.FileProcessingPipeline
|
||||||
|
import de.itkl.fileprocessing.FileProcessor
|
||||||
import de.itkl.fileprocessing.ProgressBarFactory
|
import de.itkl.fileprocessing.ProgressBarFactory
|
||||||
import org.koin.core.component.KoinComponent
|
import org.koin.core.component.KoinComponent
|
||||||
|
|
||||||
/**
 * File-processing pipeline for TF-IDF: the document-frequency step is listed first,
 * followed by the inverse-document-frequency step.
 */
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
    override val fileProcessor: List<FileProcessor> = listOf(
        DocumentFrequency(),
        InverseDocumentFrequency(),
    )
}
|
||||||
Loading…
Reference in New Issue