diff --git a/app/src/main/kotlin/docthor/app/App.kt b/app/src/main/kotlin/docthor/app/App.kt index 388d7b7..e9b3077 100644 --- a/app/src/main/kotlin/docthor/app/App.kt +++ b/app/src/main/kotlin/docthor/app/App.kt @@ -8,7 +8,7 @@ import com.github.ajalt.clikt.parameters.types.enum import com.github.ajalt.clikt.parameters.types.file import de.itkl.textprocessing.TextFile import de.itkl.tfidf.Language -import de.itkl.tfidf.TfIdf +//import de.itkl.tfidf.TfIdf import de.itkl.tfidf.TfIdfPipeline import kotlinx.coroutines.flow.take import kotlinx.coroutines.runBlocking diff --git a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProcessingPipeline.kt b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProcessingPipeline.kt index 04a7872..ad8fe2a 100644 --- a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProcessingPipeline.kt +++ b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProcessingPipeline.kt @@ -9,6 +9,7 @@ private val Log = KotlinLogging.logger { } abstract class FileProcessingPipeline { protected abstract val fileProcessor: List + protected abstract val progressBarFactory: ProgressBarFactory suspend fun input(file: File) { var currentFile = file fileProcessor.forEach { processor -> @@ -17,7 +18,9 @@ abstract class FileProcessingPipeline { Log.info { "$target exists. Skipping" } } else { Log.info { "$target does not exists. Creating" } - processor.process(FileResource(currentFile)) + val resource = FileResource(currentFile) + val progress = ProgressResource(resource, progressBarFactory) + processor.process(progress) } currentFile = target.toFile() } diff --git a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProgressBarFactory.kt b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProgressBarFactory.kt new file mode 100644 index 0000000..af0e176 --- /dev/null +++ b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProgressBarFactory.kt @@ -0,0 +1,9 @@ +package de.itkl.fileprocessing + +interface ProgressBarFactory { + fun new(resource: Resource): ProgressBar +} + +interface ProgressBar : AutoCloseable { + fun update(bytesRead: Long) +} \ No newline at end of file diff --git a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProgressInputStream.kt b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProgressInputStream.kt index fceb3bb..385a9b5 100644 --- a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProgressInputStream.kt +++ b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProgressInputStream.kt @@ -11,12 +11,12 @@ import java.io.InputStream */ class ProgressInputStream( private val inputStream: InputStream, - private val updateOp: (Long) -> Unit) : InputStream() { + private val progressBar: ProgressBar) : InputStream() { @Volatile var bytesRead: Long = 0 private set(value) { field = value - updateOp(value) + progressBar.update(value) } override fun read(): Int { @@ -36,4 +36,9 @@ class ProgressInputStream( override fun read(b: ByteArray): Int { return this.read(b, 0, b.size) } + + override fun close() { + progressBar.close() + super.close() + } } \ No newline at end of file diff --git a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/Resource.kt b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/Resource.kt index 4955bdd..3d179c5 100644 --- a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/Resource.kt +++ b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/Resource.kt @@ -4,23 +4,28 @@ import java.io.File import java.io.InputStream import java.nio.file.Files import java.nio.file.Path +import kotlin.io.path.name interface Resource { val path: Path val size: Long + val filename: String fun toFile(): File = path.toFile() + + fun length() = path.toFile().length() + fun read(): InputStream } class ProgressResource( private val resource: Resource, - private val progressOpSupplier: () -> (Long) -> Unit + private val progressBarFactory: ProgressBarFactory ) : Resource by resource { override fun read(): InputStream { return ProgressInputStream( - read(), - progressOpSupplier() + resource.read(), + progressBarFactory.new(this) ) } } @@ -28,6 +33,9 @@ class ProgressResource( class FileResource(override val path: Path) : Resource { constructor(file: File): this(file.toPath()) override val size: Long by lazy { path.toFile().length() } + override val filename: String + get() = path.name + override fun read(): InputStream { return Files.newInputStream(path) } diff --git a/libraries/fileprocessing/src/main/kotlin/de/itkl/processing/ParallelFlowProcessor.kt b/libraries/fileprocessing/src/main/kotlin/de/itkl/processing/ParallelFlowProcessor.kt index 0318753..284304c 100644 --- a/libraries/fileprocessing/src/main/kotlin/de/itkl/processing/ParallelFlowProcessor.kt +++ b/libraries/fileprocessing/src/main/kotlin/de/itkl/processing/ParallelFlowProcessor.kt @@ -5,6 +5,7 @@ import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.flow import kotlinx.coroutines.flow.map import kotlinx.coroutines.flow.toList +import kotlinx.coroutines.runBlocking import kotlinx.coroutines.withContext import java.util.concurrent.Executors import java.util.concurrent.TimeUnit @@ -19,7 +20,10 @@ class ParallelFlowProcessor( suspend fun process(flow: Flow): Flow { return flow { - flow.map { kotlinx.coroutines.Runnable { mapperFn(it) } } + flow.map { kotlinx.coroutines.Runnable { + val result = mapperFn(it) + runBlocking { emit(result) } + } } .map { job -> workers.submit(job)} .toList() .forEach { future -> emit(future.get() as U) } diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt index 54c3c67..87ee5de 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Histogram.kt @@ -13,8 +13,9 @@ class Histogram(private val histo: MutableMap = mutableMapOf()) : I suspend fun fromBagOfWords(flow: Flow): Histogram { val result = Histogram() - flow.collect { bagOfWords -> - bagOfWords.forEach(result::add) + flow.collectIndexed { index, value -> + println(index) + value.forEach(result::add) } return result } diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt index 082b99d..0fbea75 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt @@ -7,13 +7,14 @@ import org.apache.lucene.analysis.standard.StandardTokenizer import org.apache.lucene.analysis.tokenattributes.CharTermAttribute import org.apache.lucene.util.AttributeFactory import java.io.File +import java.io.InputStream import java.io.InputStreamReader -class TextFile(val file: File) { +class TextFile(val inputStream: InputStream) { - fun splitByEmptyLines(progressOp: (read: Long) -> Unit = {}): Flow> { - val reader = InputStreamReader(ProgressInputStream(file.inputStream(), progressOp)) + fun splitByEmptyLines(): Flow> { + val reader = InputStreamReader(inputStream) var list = mutableListOf() return flow { reader.useLines { lines -> @@ -28,19 +29,19 @@ class TextFile(val file: File) { } } } - fun words(progressOp: (read: Long) -> Unit = {}): Flow { - val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY - val tokenizer = StandardTokenizer(factory) - val reader = ProgressInputStream(file.inputStream(), progressOp) - tokenizer.setReader(InputStreamReader(reader)) - tokenizer.reset() - val attr = tokenizer.addAttribute(CharTermAttribute::class.java) - return flow { - while (kotlin.runCatching { tokenizer.incrementToken() }.getOrElse { true } ) { - emit(attr.toString()) - } - }.onCompletion { - tokenizer.close() - } - } +// fun words(progressOp: (read: Long) -> Unit = {}): Flow { +// val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY +// val tokenizer = StandardTokenizer(factory) +// val reader = ProgressInputStream(file.inputStream(), progressOp) +// tokenizer.setReader(InputStreamReader(reader)) +// tokenizer.reset() +// val attr = tokenizer.addAttribute(CharTermAttribute::class.java) +// return flow { +// while (kotlin.runCatching { tokenizer.incrementToken() }.getOrElse { true } ) { +// emit(attr.toString()) +// } +// }.onCompletion { +// tokenizer.close() +// } +// } } \ No newline at end of file diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Idf.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Idf.kt index 65a616f..82b4b8b 100644 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Idf.kt +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/Idf.kt @@ -1,5 +1,6 @@ package de.itkl.tfidf +import com.github.ajalt.mordant.terminal.Terminal import de.itkl.fileprocessing.FileProcessor import de.itkl.fileprocessing.Resource import de.itkl.processing.ParallelFlowProcessor @@ -22,8 +23,8 @@ class Idf : FileProcessor { override suspend fun process(resource: Resource): File { Log.info { "Would produce: ${willProduce(resource.path)}" } val resultFile = willProduce(resource.path).toFile() - val textFile = TextFile(resource.toFile()) - val documents = textFile.splitByEmptyLines { } + val textFile = TextFile(resource.read()) + val documents = textFile.splitByEmptyLines() val bagOfWords = ParallelFlowProcessor, BagOfWords>( mapperFn = { document -> val tokenizer = Tokenizer() diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TerminalProgressBarFactory.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TerminalProgressBarFactory.kt new file mode 100644 index 0000000..151e04d --- /dev/null +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TerminalProgressBarFactory.kt @@ -0,0 +1,38 @@ +package de.itkl.tfidf + +import com.github.ajalt.mordant.animation.ProgressAnimation +import com.github.ajalt.mordant.animation.progressAnimation +import com.github.ajalt.mordant.terminal.Terminal +import de.itkl.fileprocessing.ProgressBar +import de.itkl.fileprocessing.ProgressBarFactory +import de.itkl.fileprocessing.Resource + +class TerminalProgressBarFactory : ProgressBarFactory { + private val terminal = Terminal() + override fun new(resource: Resource): ProgressBar { + val animation = terminal.progressAnimation { + text(resource.filename) + percentage() + progressBar() + completed() + timeRemaining() + } + return TerminalProgressBar(animation, resource.length()) + } +} + +class TerminalProgressBar( + private val animation: ProgressAnimation, total: Long) : ProgressBar { + + init { + animation.start() + animation.updateTotal(total) + } + override fun update(bytesRead: Long) { + animation.update(bytesRead) + } + + override fun close() { + animation.stop() + } +} \ No newline at end of file diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt index 0e10e27..e7ed0dd 100644 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdf.kt @@ -13,43 +13,43 @@ import kotlin.io.path.exists private val Log = KotlinLogging.logger { } -class TfIdf { - suspend fun computeTf( - corpus: File, - language: Language - ): Histogram { - Log.info { "Processing $corpus" } - val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.csv") - - if(destination.exists()) { - return HistogramCsvStorage().read(destination.toFile()) - } - - val filesize = corpus.length() - - val t = Terminal() - val histogram = t.progressBar("Indexing ${corpus.name}", filesize) { val stemmer = stemmer(language) - val words = TextFile(corpus).words {readBytes -> update(readBytes)} - .map { stemmer.stem(it) } - Histogram.from(words) - } - - t.progressBar("Saving ${histogram.size} entries", histogram.size.toLong()) { - HistogramCsvStorage() - .save(histogram,destination.toFile()) { entriesWritten -> update(entriesWritten)} - } - return histogram - } - - private fun stemmer(language: Language): SnowballStemmer { - return when(language) { - Language.DE -> GermanStemmer() - } - } - - private fun SnowballStemmer.stem(word: String): String { - current = word - stem() - return current - } -} \ No newline at end of file +//class TfIdf { +// suspend fun computeTf( +// corpus: File, +// language: Language +// ): Histogram { +// Log.info { "Processing $corpus" } +// val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.csv") +// +// if(destination.exists()) { +// return HistogramCsvStorage().read(destination.toFile()) +// } +// +// val filesize = corpus.length() +// +// val t = Terminal() +// val histogram = t.progressBar("Indexing ${corpus.name}", filesize) { val stemmer = stemmer(language) +// val words = TextFile(corpus).words {readBytes -> update(readBytes)} +// .map { stemmer.stem(it) } +// Histogram.from(words) +// } +// +// t.progressBar("Saving ${histogram.size} entries", histogram.size.toLong()) { +// HistogramCsvStorage() +// .save(histogram,destination.toFile()) { entriesWritten -> update(entriesWritten)} +// } +// return histogram +// } +// +// private fun stemmer(language: Language): SnowballStemmer { +// return when(language) { +// Language.DE -> GermanStemmer() +// } +// } +// +// private fun SnowballStemmer.stem(word: String): String { +// current = word +// stem() +// return current +// } +//} \ No newline at end of file diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt index 38eb8be..ee7b4b5 100644 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt @@ -2,9 +2,12 @@ package de.itkl.tfidf import de.itkl.fileprocessing.FileProcessingPipeline import de.itkl.fileprocessing.FileProcessor +import de.itkl.fileprocessing.ProgressBarFactory class TfIdfPipeline(private val language: Language) : FileProcessingPipeline() { override val fileProcessor = listOf( Idf() ) + override val progressBarFactory: ProgressBarFactory + get() = TerminalProgressBarFactory() } \ No newline at end of file