From 606837a76f7221267d4fcfc91ba1612680413a66 Mon Sep 17 00:00:00 2001 From: Timo Bryant Date: Thu, 21 Dec 2023 17:31:09 +0100 Subject: [PATCH] code cleanup --- app/src/main/kotlin/docthor/app/App.kt | 2 +- .../itkl/fileprocessing/ProcessingPipeline.kt | 6 +- .../processing/UnorderedParallelFlowMap.kt | 13 +++-- .../kotlin/de/itkl/textprocessing/TextFile.kt | 39 ++++--------- .../tfidf/{Idf.kt => DocumentFrequency.kt} | 12 +--- .../de/itkl/tfidf/InverseDocumentFrequency.kt | 4 ++ .../tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt | 9 --- .../src/main/kotlin/de/itkl/tfidf/TfIdf.kt | 55 ------------------- .../kotlin/de/itkl/tfidf/TfIdfPipeline.kt | 5 +- .../src/main/kotlin/de/itkl/tfidf/tui.kt | 21 ------- 10 files changed, 34 insertions(+), 132 deletions(-) rename libraries/tfidf/src/main/kotlin/de/itkl/tfidf/{Idf.kt => DocumentFrequency.kt} (83%) create mode 100644 libraries/tfidf/src/main/kotlin/de/itkl/tfidf/InverseDocumentFrequency.kt delete mode 100644 libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt delete mode 100644 libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt delete mode 100644 libraries/tfidf/src/main/kotlin/de/itkl/tfidf/tui.kt diff --git a/app/src/main/kotlin/docthor/app/App.kt b/app/src/main/kotlin/docthor/app/App.kt index e9b3077..ba7e0fa 100644 --- a/app/src/main/kotlin/docthor/app/App.kt +++ b/app/src/main/kotlin/docthor/app/App.kt @@ -22,7 +22,7 @@ class ComputeTf : CliktCommand() { .required() override fun run() = runBlocking { - TfIdfPipeline(language = Language.DE) + TfIdfPipeline(language = Language.DE, force = true) .input(corpus) // TextFile(corpus).splitByEmptyLines() // .take(10) diff --git a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProcessingPipeline.kt b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProcessingPipeline.kt index 36039a6..436d3c3 100644 --- a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProcessingPipeline.kt +++ b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProcessingPipeline.kt @@ -6,7 +6,9 @@ import kotlin.io.path.exists private val Log = KotlinLogging.logger { } -abstract class FileProcessingPipeline { +abstract class FileProcessingPipeline(private val force: Boolean = false) { + + protected abstract val fileProcessor: List protected abstract val progressBarFactory: ProgressBarFactory @@ -14,7 +16,7 @@ abstract class FileProcessingPipeline { var currentFile = file fileProcessor.forEach { processor -> val target = processor.willProduce(currentFile.toPath()) - if(target.exists()) { + if(target.exists() && !force) { Log.info { "$target exists. Skipping" } } else { Log.info { "$target does not exists. Creating" } diff --git a/libraries/fileprocessing/src/main/kotlin/de/itkl/processing/UnorderedParallelFlowMap.kt b/libraries/fileprocessing/src/main/kotlin/de/itkl/processing/UnorderedParallelFlowMap.kt index 99de869..ae02292 100644 --- a/libraries/fileprocessing/src/main/kotlin/de/itkl/processing/UnorderedParallelFlowMap.kt +++ b/libraries/fileprocessing/src/main/kotlin/de/itkl/processing/UnorderedParallelFlowMap.kt @@ -1,12 +1,13 @@ package de.itkl.processing +import io.github.oshai.kotlinlogging.KotlinLogging import kotlinx.coroutines.* import kotlinx.coroutines.channels.Channel import kotlinx.coroutines.channels.consumeEach import kotlinx.coroutines.flow.* +private val Log = KotlinLogging.logger { } class ParallelUnorderedFlow( - private val producerJob: Job, private val mapperFlow: Flow ) : Flow { override suspend fun collect(collector: FlowCollector) { @@ -22,13 +23,15 @@ suspend fun Flow.parallelUnordered( val producerChannel = Channel() - val producersJob = scope.launch(Dispatchers.Default) { + scope.launch(Dispatchers.Default) { collect { - producerChannel.send(it)} + producerChannel.send(it) + } + producerChannel.close() } val mapperFlow = channelFlow { - (0..numWorkers).map { consumer -> + (0..numWorkers).map { launch(Dispatchers.Default) { producerChannel.consumeEach { send(mapperFn(it)) @@ -36,5 +39,5 @@ suspend fun Flow.parallelUnordered( } } } - return ParallelUnorderedFlow(producersJob, mapperFlow) + return ParallelUnorderedFlow(mapperFlow) } \ No newline at end of file diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt index 0fbea75..0b0741c 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt @@ -11,37 +11,22 @@ import java.io.InputStream import java.io.InputStreamReader -class TextFile(val inputStream: InputStream) { - +class TextFile(private val inputStream: InputStream) { fun splitByEmptyLines(): Flow> { - val reader = InputStreamReader(inputStream) - var list = mutableListOf() - return flow { - reader.useLines { lines -> - lines.forEach { line -> - if(line.isEmpty()) { - emit(list) - list = mutableListOf() - } else { - list.add(line) + return InputStreamReader(inputStream).use { reader -> + var list = mutableListOf() + flow { + reader.useLines { lines -> + lines.forEach { line -> + if(line.isEmpty()) { + emit(list) + list = mutableListOf() + } else { + list.add(line) + } } } } } } -// fun words(progressOp: (read: Long) -> Unit = {}): Flow { -// val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY -// val tokenizer = StandardTokenizer(factory) -// val reader = ProgressInputStream(file.inputStream(), progressOp) -// tokenizer.setReader(InputStreamReader(reader)) -// tokenizer.reset() -// val attr = tokenizer.addAttribute(CharTermAttribute::class.java) -// return flow { -// while (kotlin.runCatching { tokenizer.incrementToken() }.getOrElse { true } ) { -// emit(attr.toString()) -// } -// }.onCompletion { -// tokenizer.close() -// } -// } } \ No newline at end of file diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Idf.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/DocumentFrequency.kt similarity index 83% rename from libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Idf.kt rename to libraries/tfidf/src/main/kotlin/de/itkl/tfidf/DocumentFrequency.kt index 4dc52d7..1ad2aea 100644 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Idf.kt +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/DocumentFrequency.kt @@ -1,28 +1,21 @@ package de.itkl.tfidf -import com.github.ajalt.mordant.terminal.Terminal import de.itkl.fileprocessing.FileProcessor import de.itkl.fileprocessing.Resource import de.itkl.processing.parallelUnordered import de.itkl.textprocessing.* import io.github.oshai.kotlinlogging.KotlinLogging import kotlinx.coroutines.* -import kotlinx.coroutines.channels.Channel -import kotlinx.coroutines.channels.consumeEach import kotlinx.coroutines.flow.* -import kotlinx.coroutines.sync.Semaphore -import kotlinx.coroutines.sync.withPermit import java.io.File import java.nio.file.Path -import java.util.concurrent.atomic.AtomicInteger import kotlin.io.path.nameWithoutExtension private val Log = KotlinLogging.logger { } -class Idf : FileProcessor { - +class DocumentFrequency : FileProcessor { override fun willProduce(path: Path): Path { - return path.parent.resolve(path.nameWithoutExtension + "-idf.csv") + return path.parent.resolve(path.nameWithoutExtension + "-document-frequency.csv") } override suspend fun process(resource: Resource): File = coroutineScope { @@ -35,6 +28,7 @@ class Idf : FileProcessor { result } .reduce { acc, other -> acc.join(other)} + Log.info { "Writing CSV $resultFile" } HistogramCsvStorage().save(histogram, resultFile) resultFile } diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/InverseDocumentFrequency.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/InverseDocumentFrequency.kt new file mode 100644 index 0000000..61e12f4 --- /dev/null +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/InverseDocumentFrequency.kt @@ -0,0 +1,4 @@ +package de.itkl.tfidf + +class InverseDocumentFrequency { +} \ No newline at end of file diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt deleted file mode 100644 index 38c4480..0000000 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Tf.kt +++ /dev/null @@ -1,9 +0,0 @@ -package de.itkl.tfidf - -import com.github.doyaaaaaken.kotlincsv.dsl.csvReader -import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter -import de.itkl.textprocessing.Histogram -import io.github.oshai.kotlinlogging.KotlinLogging -import java.io.File - -private val Log = KotlinLogging.logger { } diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt deleted file mode 100644 index e7ed0dd..0000000 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt +++ /dev/null @@ -1,55 +0,0 @@ -package de.itkl.tfidf - -import com.github.ajalt.mordant.terminal.Terminal -import de.itkl.textprocessing.Histogram -import de.itkl.textprocessing.HistogramCsvStorage -import de.itkl.textprocessing.TextFile -import io.github.oshai.kotlinlogging.KotlinLogging -import kotlinx.coroutines.flow.map -import org.tartarus.snowball.SnowballStemmer -import org.tartarus.snowball.ext.GermanStemmer -import java.io.File -import kotlin.io.path.exists - - -private val Log = KotlinLogging.logger { } -//class TfIdf { -// suspend fun computeTf( -// corpus: File, -// language: Language -// ): Histogram { -// Log.info { "Processing $corpus" } -// val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.csv") -// -// if(destination.exists()) { -// return HistogramCsvStorage().read(destination.toFile()) -// } -// -// val filesize = corpus.length() -// -// val t = Terminal() -// val histogram = t.progressBar("Indexing ${corpus.name}", filesize) { val stemmer = stemmer(language) -// val words = TextFile(corpus).words {readBytes -> update(readBytes)} -// .map { stemmer.stem(it) } -// Histogram.from(words) -// } -// -// t.progressBar("Saving ${histogram.size} entries", histogram.size.toLong()) { -// HistogramCsvStorage() -// .save(histogram,destination.toFile()) { entriesWritten -> update(entriesWritten)} -// } -// return histogram -// } -// -// private fun stemmer(language: Language): SnowballStemmer { -// return when(language) { -// Language.DE -> GermanStemmer() -// } -// } -// -// private fun SnowballStemmer.stem(word: String): String { -// current = word -// stem() -// return current -// } -//} \ No newline at end of file diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt index ee7b4b5..d91312a 100644 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt @@ -1,12 +1,11 @@ package de.itkl.tfidf import de.itkl.fileprocessing.FileProcessingPipeline -import de.itkl.fileprocessing.FileProcessor import de.itkl.fileprocessing.ProgressBarFactory -class TfIdfPipeline(private val language: Language) : FileProcessingPipeline() { +class TfIdfPipeline(private val language: Language, force: Boolean) : FileProcessingPipeline(force) { override val fileProcessor = listOf( - Idf() + DocumentFrequency() ) override val progressBarFactory: ProgressBarFactory get() = TerminalProgressBarFactory() diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/tui.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/tui.kt deleted file mode 100644 index e1b3a4b..0000000 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/tui.kt +++ /dev/null @@ -1,21 +0,0 @@ -package de.itkl.tfidf - -import com.github.ajalt.mordant.animation.ProgressAnimation -import com.github.ajalt.mordant.animation.progressAnimation -import com.github.ajalt.mordant.terminal.Terminal -import java.awt.SystemColor.text - -suspend fun Terminal.progressBar(name: String, overall: Long, context: suspend ProgressAnimation.() -> T):T { - val progress = progressAnimation { - text(name) - percentage() - progressBar() - completed() - timeRemaining() - } - progress.start() - progress.updateTotal(overall) - val result = context(progress) - progress.stop() - return result -}