utilize koin

develop
Timo Bryant 2023-12-21 18:16:12 +01:00
parent 606837a76f
commit 81a30dd2f6
13 changed files with 93 additions and 58 deletions

View File

@@ -6,14 +6,17 @@ import com.github.ajalt.clikt.parameters.options.option
import com.github.ajalt.clikt.parameters.options.required
import com.github.ajalt.clikt.parameters.types.enum
import com.github.ajalt.clikt.parameters.types.file
import de.itkl.textprocessing.TextFile
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.textprocessing.textProcessingModule
import de.itkl.tfidf.Language
import de.itkl.tfidf.TerminalProgressBarFactory
//import de.itkl.tfidf.TfIdf
import de.itkl.tfidf.TfIdfPipeline
import kotlinx.coroutines.flow.take
import kotlinx.coroutines.runBlocking
import org.koin.core.context.startKoin
import org.koin.dsl.module
class ComputeTf : CliktCommand() {
class ComputeIdf : CliktCommand() {
private val corpus by option(help = "corpus")
.file()
.required()
@@ -22,18 +25,20 @@ class ComputeTf : CliktCommand() {
.required()
override fun run() = runBlocking {
TfIdfPipeline(language = Language.DE, force = true)
TfIdfPipeline(force = true)
.input(corpus)
// TextFile(corpus).splitByEmptyLines()
// .take(10)
// .collect { println(it) }
// val tfIdf = TfIdf()
// val histogram = tfIdf.computeTf(
// corpus,
// language
// )
// val tf = tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile())
}
}
fun main(args: Array<String>) = ComputeTf().main(args)
fun main(args: Array<String>) {
startKoin {
modules(
textProcessingModule,
module {
single<ProgressBarFactory> {
TerminalProgressBarFactory()
}
})
ComputeIdf().main(args)
}
}
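
A minimal sketch of the same entry point with an explicit shutdown, assuming the imports shown above. startKoin and stopKoin are standard Koin 3.x calls; the try/finally placement is an assumption rather than part of this commit, and only matters if the command is ever embedded in a longer-running process.

import org.koin.core.context.startKoin
import org.koin.core.context.stopKoin
import org.koin.dsl.module

fun main(args: Array<String>) {
    startKoin {
        modules(
            textProcessingModule,
            module { single<ProgressBarFactory> { TerminalProgressBarFactory() } }
        )
    }
    try {
        ComputeIdf().main(args)  // run the Clikt command against the started Koin context
    } finally {
        stopKoin()               // release the global Koin context; optional for a one-shot CLI
    }
}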

View File

@@ -10,7 +10,9 @@ repositories {
}
dependencies {
val koin_version = "3.5.3"
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
implementation("io.insert-koin:koin-core:$koin_version")
}
java {

View File

@@ -1,22 +1,10 @@
import org.codehaus.groovy.tools.shell.util.Logger.io
/*
* This file was generated by the Gradle 'init' task.
*
* This project uses @Incubating APIs which are subject to change.
*/
plugins {
// Apply the common convention plugin for shared build configuration between library and application projects.
id("docthor.kotlin-common-conventions")
// Apply the java-library plugin for API and implementation separation.
`java-library`
}
dependencies {
api("io.github.oshai:kotlin-logging-jvm:5.1.0")
implementation("org.slf4j:slf4j-api:2.0.9")
}

View File

@@ -1,17 +1,19 @@
package de.itkl.fileprocessing
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.annotation.KoinReflectAPI
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
import kotlin.io.path.exists
private val Log = KotlinLogging.logger { }
abstract class FileProcessingPipeline(private val force: Boolean = false) {
abstract class FileProcessingPipeline(private val force: Boolean = false) : KoinComponent {
protected abstract val fileProcessor: List<FileProcessor>
protected abstract val progressBarFactory: ProgressBarFactory
private val progressBarFactory: ProgressBarFactory by inject()
suspend fun input(file: File) {
var currentFile = file
fileProcessor.forEach { processor ->

View File

@@ -1,32 +1,32 @@
package de.itkl.textprocessing
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.onCompletion
import org.apache.lucene.analysis.standard.StandardTokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.AttributeFactory
import java.io.File
import kotlinx.coroutines.withContext
import java.io.InputStream
import java.io.InputStreamReader
class TextFile(private val inputStream: InputStream) {
fun splitByEmptyLines(): Flow<List<String>> {
return InputStreamReader(inputStream).use { reader ->
var list = mutableListOf<String>()
flow {
reader.useLines { lines ->
lines.forEach { line ->
if(line.isEmpty()) {
emit(list)
list = mutableListOf()
} else {
list.add(line)
}
val reader = InputStreamReader(inputStream)
var list = mutableListOf<String>()
return flow<List<String>> {
reader.useLines { lines ->
lines.forEach { line ->
if(line.isEmpty()) {
emit(list)
list = mutableListOf()
} else {
list.add(line)
}
}
}
}
}.onCompletion {
withContext(Dispatchers.IO) {
reader.close()
} }
}
}
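
A small usage sketch of the reworked splitByEmptyLines, with made-up sample text. As the hunk above shows, a block is only emitted when an empty line follows it, so the sample ends with one.

import de.itkl.textprocessing.TextFile
import kotlinx.coroutines.flow.collect
import kotlinx.coroutines.runBlocking

fun main() = runBlocking {
    val sample = "erste Zeile\nzweite Zeile\n\nzweiter Absatz\n\n"
    TextFile(sample.byteInputStream())
        .splitByEmptyLines()
        .collect { block -> println(block) }  // prints [erste Zeile, zweite Zeile], then [zweiter Absatz]
}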

View File

@@ -1,23 +1,21 @@
package de.itkl.textprocessing
package de.itkl.textprocessing.implementation
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.onCompletion
import de.itkl.textprocessing.interfaces.Tokenizer
import org.apache.lucene.analysis.standard.StandardTokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.AttributeFactory
import java.io.StringReader
class Tokenizer {
class LuceneTokenizer : Tokenizer {
private val tokenizer by lazy {
val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
val tokenizer = StandardTokenizer(factory)
tokenizer
}
fun tokenize(input: String): Sequence<String> {
val reader = StringReader(input)
override fun tokenize(text: String): Sequence<String> {
val reader = StringReader(text)
tokenizer.setReader(reader)
tokenizer.reset()
val attr = tokenizer.addAttribute(CharTermAttribute::class.java)
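
The hunk is truncated after addAttribute; from the caller's side the new interface looks like the sketch below (the sentence is made up and the token list in the comment is illustrative only).

import de.itkl.textprocessing.implementation.LuceneTokenizer
import de.itkl.textprocessing.interfaces.Tokenizer

fun demo() {
    val tokenizer: Tokenizer = LuceneTokenizer()  // constructed directly here; via Koin it comes from textProcessingModule
    val tokens = tokenizer.tokenize("Der schnelle braune Fuchs springt über den Zaun.")
    println(tokens.toList())  // roughly [Der, schnelle, braune, Fuchs, springt, über, den, Zaun]
}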

View File

@@ -0,0 +1,13 @@
package de.itkl.textprocessing.implementation
import de.itkl.textprocessing.interfaces.Stemmer
import org.tartarus.snowball.ext.GermanStemmer
class SnowballStemmerGerman : Stemmer {
private val german = GermanStemmer()
override fun stem(word: String): String {
german.current = word
german.stem()
return german.current
}
}
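
The stemmer in isolation, as a sketch; the exact stems depend on the Snowball German algorithm, so the values in the comments are illustrative only.

import de.itkl.textprocessing.implementation.SnowballStemmerGerman
import de.itkl.textprocessing.interfaces.Stemmer

fun demo() {
    val stemmer: Stemmer = SnowballStemmerGerman()
    println(stemmer.stem("Häuser"))    // e.g. "haus"
    println(stemmer.stem("gelaufen"))  // e.g. "gelauf"
}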

View File

@@ -0,0 +1,5 @@
package de.itkl.textprocessing.interfaces
interface Stemmer {
fun stem(word: String): String
}

View File

@@ -0,0 +1,5 @@
package de.itkl.textprocessing.interfaces
interface Tokenizer {
fun tokenize(text: String): Sequence<String>
}

View File

@@ -0,0 +1,12 @@
package de.itkl.textprocessing
import de.itkl.textprocessing.implementation.LuceneTokenizer
import de.itkl.textprocessing.implementation.SnowballStemmerGerman
import de.itkl.textprocessing.interfaces.Stemmer
import de.itkl.textprocessing.interfaces.Tokenizer
import org.koin.dsl.module
val textProcessingModule = module {
factory<Tokenizer> { LuceneTokenizer() }
factory<Stemmer> { SnowballStemmerGerman() }
}
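
How the module is meant to be consumed, sketched under the assumption that startKoin has installed textProcessingModule as in the CLI's main above: any KoinComponent can resolve the interfaces without touching the Lucene or Snowball types. Because both bindings use factory, each injection point receives a fresh instance; single would share one. The helper class below is hypothetical, not part of the commit.

import de.itkl.textprocessing.interfaces.Stemmer
import de.itkl.textprocessing.interfaces.Tokenizer
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject

class StemmedTokenizer : KoinComponent {
    private val tokenizer: Tokenizer by inject()
    private val stemmer: Stemmer by inject()

    fun stems(text: String): List<String> =
        tokenizer.tokenize(text).map { stemmer.stem(it) }.toList()
}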

View File

@@ -4,16 +4,20 @@ import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.Resource
import de.itkl.processing.parallelUnordered
import de.itkl.textprocessing.*
import de.itkl.textprocessing.interfaces.Stemmer
import de.itkl.textprocessing.interfaces.Tokenizer
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.*
import kotlinx.coroutines.flow.*
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
private val Log = KotlinLogging.logger { }
class DocumentFrequency : FileProcessor {
class DocumentFrequency : FileProcessor, KoinComponent {
override fun willProduce(path: Path): Path {
return path.parent.resolve(path.nameWithoutExtension + "-document-frequency.csv")
}
@@ -37,10 +41,11 @@ class DocumentFrequency : FileProcessor {
if (document.isEmpty()) {
return Histogram()
}
val tokenizer = Tokenizer()
val tokenizer: Tokenizer by inject()
val stemmer: Stemmer by inject()
val bagOfWords = document.map { line ->
val tokens = tokenizer.tokenize(line)
BagOfWords.from(tokens)
BagOfWords.from(tokens.map { stemmer.stem(it) })
}
.reduce { acc, bagOfWords -> acc.join(bagOfWords) }
return Histogram.fromBagOfWords(bagOfWords)
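
A possible variant, not what the commit does: because the Tokenizer and Stemmer delegates above are declared inside the method, a new Lazy is created on every call; hoisting them to properties resolves each one once per DocumentFrequency instance. A fragment showing only the changed declarations:

class DocumentFrequency : FileProcessor, KoinComponent {
    // resolved lazily, once per DocumentFrequency instance instead of once per call
    private val tokenizer: Tokenizer by inject()
    private val stemmer: Stemmer by inject()
    // ... remaining members unchanged
}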

View File

@@ -34,5 +34,6 @@ class TerminalProgressBar(
override fun close() {
animation.stop()
println()
}
}

View File

@@ -2,11 +2,10 @@ package de.itkl.tfidf
import de.itkl.fileprocessing.FileProcessingPipeline
import de.itkl.fileprocessing.ProgressBarFactory
import org.koin.core.component.KoinComponent
class TfIdfPipeline(private val language: Language, force: Boolean) : FileProcessingPipeline(force) {
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
override val fileProcessor = listOf(
DocumentFrequency()
)
override val progressBarFactory: ProgressBarFactory
get() = TerminalProgressBarFactory()
}
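
Putting the pieces together, an end-to-end sketch of what the CLI now does once Koin is started; the corpus path below is a placeholder.

import de.itkl.tfidf.TfIdfPipeline
import kotlinx.coroutines.runBlocking
import java.io.File

fun runPipeline() = runBlocking {
    // assumes startKoin { modules(textProcessingModule, ...) } has already run, as in main above
    TfIdfPipeline(force = true).input(File("corpus.txt"))  // placeholder path
}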