diff --git a/app/src/main/kotlin/docthor/app/App.kt b/app/src/main/kotlin/docthor/app/App.kt index ba7e0fa..4277558 100644 --- a/app/src/main/kotlin/docthor/app/App.kt +++ b/app/src/main/kotlin/docthor/app/App.kt @@ -6,14 +6,17 @@ import com.github.ajalt.clikt.parameters.options.option import com.github.ajalt.clikt.parameters.options.required import com.github.ajalt.clikt.parameters.types.enum import com.github.ajalt.clikt.parameters.types.file -import de.itkl.textprocessing.TextFile +import de.itkl.fileprocessing.ProgressBarFactory +import de.itkl.textprocessing.textProcessingModule import de.itkl.tfidf.Language +import de.itkl.tfidf.TerminalProgressBarFactory //import de.itkl.tfidf.TfIdf import de.itkl.tfidf.TfIdfPipeline -import kotlinx.coroutines.flow.take import kotlinx.coroutines.runBlocking +import org.koin.core.context.startKoin +import org.koin.dsl.module -class ComputeTf : CliktCommand() { +class ComputeIdf : CliktCommand() { private val corpus by option(help = "corpus") .file() .required() @@ -22,18 +25,20 @@ class ComputeTf : CliktCommand() { .required() override fun run() = runBlocking { - TfIdfPipeline(language = Language.DE, force = true) + TfIdfPipeline(force = true) .input(corpus) -// TextFile(corpus).splitByEmptyLines() -// .take(10) -// .collect { println(it) } -// val tfIdf = TfIdf() -// val histogram = tfIdf.computeTf( -// corpus, -// language -// ) -// val tf = tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile()) } } -fun main(args: Array) = ComputeTf().main(args) +fun main(args: Array) { + startKoin { + modules( + textProcessingModule, + module { + single { + TerminalProgressBarFactory() + } + }) + ComputeIdf().main(args) + } +} diff --git a/build-logic/src/main/kotlin/docthor.kotlin-common-conventions.gradle.kts b/build-logic/src/main/kotlin/docthor.kotlin-common-conventions.gradle.kts index 84b11da..03ff015 100644 --- a/build-logic/src/main/kotlin/docthor.kotlin-common-conventions.gradle.kts +++ b/build-logic/src/main/kotlin/docthor.kotlin-common-conventions.gradle.kts @@ -10,7 +10,9 @@ repositories { } dependencies { + val koin_version = "3.5.3" implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3") + implementation("io.insert-koin:koin-core:$koin_version") } java { diff --git a/build-logic/src/main/kotlin/docthor.kotlin-library-conventions.gradle.kts b/build-logic/src/main/kotlin/docthor.kotlin-library-conventions.gradle.kts index ad1d2e2..9ec886e 100644 --- a/build-logic/src/main/kotlin/docthor.kotlin-library-conventions.gradle.kts +++ b/build-logic/src/main/kotlin/docthor.kotlin-library-conventions.gradle.kts @@ -1,22 +1,10 @@ -import org.codehaus.groovy.tools.shell.util.Logger.io - -/* - * This file was generated by the Gradle 'init' task. - * - * This project uses @Incubating APIs which are subject to change. - */ - plugins { - // Apply the common convention plugin for shared build configuration between library and application projects. id("docthor.kotlin-common-conventions") - - // Apply the java-library plugin for API and implementation separation. `java-library` } dependencies { api("io.github.oshai:kotlin-logging-jvm:5.1.0") - implementation("org.slf4j:slf4j-api:2.0.9") } \ No newline at end of file diff --git a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProcessingPipeline.kt b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProcessingPipeline.kt index 436d3c3..9643c63 100644 --- a/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProcessingPipeline.kt +++ b/libraries/fileprocessing/src/main/kotlin/de/itkl/fileprocessing/ProcessingPipeline.kt @@ -1,17 +1,19 @@ package de.itkl.fileprocessing import io.github.oshai.kotlinlogging.KotlinLogging +import org.koin.core.annotation.KoinReflectAPI +import org.koin.core.component.KoinComponent +import org.koin.core.component.inject import java.io.File import kotlin.io.path.exists private val Log = KotlinLogging.logger { } -abstract class FileProcessingPipeline(private val force: Boolean = false) { - +abstract class FileProcessingPipeline(private val force: Boolean = false) : KoinComponent { protected abstract val fileProcessor: List - protected abstract val progressBarFactory: ProgressBarFactory + private val progressBarFactory: ProgressBarFactory by inject() suspend fun input(file: File) { var currentFile = file fileProcessor.forEach { processor -> diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt index 0b0741c..309382e 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/TextFile.kt @@ -1,32 +1,32 @@ package de.itkl.textprocessing +import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.flow import kotlinx.coroutines.flow.onCompletion -import org.apache.lucene.analysis.standard.StandardTokenizer -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute -import org.apache.lucene.util.AttributeFactory -import java.io.File +import kotlinx.coroutines.withContext import java.io.InputStream import java.io.InputStreamReader class TextFile(private val inputStream: InputStream) { fun splitByEmptyLines(): Flow> { - return InputStreamReader(inputStream).use { reader -> - var list = mutableListOf() - flow { - reader.useLines { lines -> - lines.forEach { line -> - if(line.isEmpty()) { - emit(list) - list = mutableListOf() - } else { - list.add(line) - } + val reader = InputStreamReader(inputStream) + var list = mutableListOf() + return flow> { + reader.useLines { lines -> + lines.forEach { line -> + if(line.isEmpty()) { + emit(list) + list = mutableListOf() + } else { + list.add(line) } } } - } + }.onCompletion { + withContext(Dispatchers.IO) { + reader.close() + } } } } \ No newline at end of file diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Tokenizer.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/implementation/LuceneTokenizer.kt similarity index 73% rename from libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Tokenizer.kt rename to libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/implementation/LuceneTokenizer.kt index bd609cc..127de1c 100644 --- a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/Tokenizer.kt +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/implementation/LuceneTokenizer.kt @@ -1,23 +1,21 @@ -package de.itkl.textprocessing +package de.itkl.textprocessing.implementation -import kotlinx.coroutines.flow.Flow -import kotlinx.coroutines.flow.flow -import kotlinx.coroutines.flow.onCompletion +import de.itkl.textprocessing.interfaces.Tokenizer import org.apache.lucene.analysis.standard.StandardTokenizer import org.apache.lucene.analysis.tokenattributes.CharTermAttribute import org.apache.lucene.util.AttributeFactory import java.io.StringReader -class Tokenizer { +class LuceneTokenizer : Tokenizer { private val tokenizer by lazy { val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY val tokenizer = StandardTokenizer(factory) tokenizer } - fun tokenize(input: String): Sequence { - val reader = StringReader(input) + override fun tokenize(text: String): Sequence { + val reader = StringReader(text) tokenizer.setReader(reader) tokenizer.reset() val attr = tokenizer.addAttribute(CharTermAttribute::class.java) diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/implementation/SnowballStemmerGerman.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/implementation/SnowballStemmerGerman.kt new file mode 100644 index 0000000..fec9353 --- /dev/null +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/implementation/SnowballStemmerGerman.kt @@ -0,0 +1,13 @@ +package de.itkl.textprocessing.implementation + +import de.itkl.textprocessing.interfaces.Stemmer +import org.tartarus.snowball.ext.GermanStemmer + +class SnowballStemmerGerman : Stemmer { + private val german = GermanStemmer() + override fun stem(word: String): String { + german.current = word + german.stem() + return german.current + } +} \ No newline at end of file diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/interfaces/Stemmer.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/interfaces/Stemmer.kt new file mode 100644 index 0000000..6848e0f --- /dev/null +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/interfaces/Stemmer.kt @@ -0,0 +1,5 @@ +package de.itkl.textprocessing.interfaces + +interface Stemmer { + fun stem(word: String): String +} \ No newline at end of file diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/interfaces/Tokenizer.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/interfaces/Tokenizer.kt new file mode 100644 index 0000000..491d0ef --- /dev/null +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/interfaces/Tokenizer.kt @@ -0,0 +1,5 @@ +package de.itkl.textprocessing.interfaces + +interface Tokenizer { + fun tokenize(text: String): Sequence +} \ No newline at end of file diff --git a/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/textProcessingModule.kt b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/textProcessingModule.kt new file mode 100644 index 0000000..f5bd4be --- /dev/null +++ b/libraries/textprocessing/src/main/kotlin/de/itkl/textprocessing/textProcessingModule.kt @@ -0,0 +1,12 @@ +package de.itkl.textprocessing + +import de.itkl.textprocessing.implementation.LuceneTokenizer +import de.itkl.textprocessing.implementation.SnowballStemmerGerman +import de.itkl.textprocessing.interfaces.Stemmer +import de.itkl.textprocessing.interfaces.Tokenizer +import org.koin.dsl.module + +val textProcessingModule = module { + factory { LuceneTokenizer() } + factory { SnowballStemmerGerman() } +} \ No newline at end of file diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/DocumentFrequency.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/DocumentFrequency.kt index 1ad2aea..8b4b2dd 100644 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/DocumentFrequency.kt +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/DocumentFrequency.kt @@ -4,16 +4,20 @@ import de.itkl.fileprocessing.FileProcessor import de.itkl.fileprocessing.Resource import de.itkl.processing.parallelUnordered import de.itkl.textprocessing.* +import de.itkl.textprocessing.interfaces.Stemmer +import de.itkl.textprocessing.interfaces.Tokenizer import io.github.oshai.kotlinlogging.KotlinLogging import kotlinx.coroutines.* import kotlinx.coroutines.flow.* +import org.koin.core.component.KoinComponent +import org.koin.core.component.inject import java.io.File import java.nio.file.Path import kotlin.io.path.nameWithoutExtension private val Log = KotlinLogging.logger { } -class DocumentFrequency : FileProcessor { +class DocumentFrequency : FileProcessor, KoinComponent { override fun willProduce(path: Path): Path { return path.parent.resolve(path.nameWithoutExtension + "-document-frequency.csv") } @@ -37,10 +41,11 @@ class DocumentFrequency : FileProcessor { if (document.isEmpty()) { return Histogram() } - val tokenizer = Tokenizer() + val tokenizer: Tokenizer by inject() + val stemmer: Stemmer by inject() val bagOfWords = document.map { line -> val tokens = tokenizer.tokenize(line) - BagOfWords.from(tokens) + BagOfWords.from(tokens.map { stemmer.stem(it) }) } .reduce { acc, bagOfWords -> acc.join(bagOfWords) } return Histogram.fromBagOfWords(bagOfWords) diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TerminalProgressBarFactory.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TerminalProgressBarFactory.kt index 151e04d..e53d6bd 100644 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TerminalProgressBarFactory.kt +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TerminalProgressBarFactory.kt @@ -34,5 +34,6 @@ class TerminalProgressBar( override fun close() { animation.stop() + println() } } \ No newline at end of file diff --git a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt index d91312a..41922fc 100644 --- a/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt +++ b/libraries/tfidf/src/main/kotlin/de/itkl/tfidf/TfIdfPipeline.kt @@ -2,11 +2,10 @@ package de.itkl.tfidf import de.itkl.fileprocessing.FileProcessingPipeline import de.itkl.fileprocessing.ProgressBarFactory +import org.koin.core.component.KoinComponent -class TfIdfPipeline(private val language: Language, force: Boolean) : FileProcessingPipeline(force) { +class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) { override val fileProcessor = listOf( DocumentFrequency() ) - override val progressBarFactory: ProgressBarFactory - get() = TerminalProgressBarFactory() } \ No newline at end of file