utilize koin
parent
606837a76f
commit
81a30dd2f6
|
|
@ -6,14 +6,17 @@ import com.github.ajalt.clikt.parameters.options.option
|
|||
import com.github.ajalt.clikt.parameters.options.required
|
||||
import com.github.ajalt.clikt.parameters.types.enum
|
||||
import com.github.ajalt.clikt.parameters.types.file
|
||||
import de.itkl.textprocessing.TextFile
|
||||
import de.itkl.fileprocessing.ProgressBarFactory
|
||||
import de.itkl.textprocessing.textProcessingModule
|
||||
import de.itkl.tfidf.Language
|
||||
import de.itkl.tfidf.TerminalProgressBarFactory
|
||||
//import de.itkl.tfidf.TfIdf
|
||||
import de.itkl.tfidf.TfIdfPipeline
|
||||
import kotlinx.coroutines.flow.take
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import org.koin.core.context.startKoin
|
||||
import org.koin.dsl.module
|
||||
|
||||
class ComputeTf : CliktCommand() {
|
||||
class ComputeIdf : CliktCommand() {
|
||||
private val corpus by option(help = "corpus")
|
||||
.file()
|
||||
.required()
|
||||
|
|
@ -22,18 +25,20 @@ class ComputeTf : CliktCommand() {
|
|||
.required()
|
||||
|
||||
override fun run() = runBlocking {
|
||||
TfIdfPipeline(language = Language.DE, force = true)
|
||||
TfIdfPipeline(force = true)
|
||||
.input(corpus)
|
||||
// TextFile(corpus).splitByEmptyLines()
|
||||
// .take(10)
|
||||
// .collect { println(it) }
|
||||
// val tfIdf = TfIdf()
|
||||
// val histogram = tfIdf.computeTf(
|
||||
// corpus,
|
||||
// language
|
||||
// )
|
||||
// val tf = tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile())
|
||||
}
|
||||
}
|
||||
|
||||
fun main(args: Array<String>) = ComputeTf().main(args)
|
||||
/**
 * Command-line entry point.
 *
 * Starts Koin with [textProcessingModule] plus an inline module that binds
 * [ProgressBarFactory] to [TerminalProgressBarFactory], then runs the
 * [ComputeIdf] command with the given arguments.
 *
 * @param args raw command-line arguments, forwarded unchanged to [ComputeIdf].
 */
fun main(args: Array<String>) {
|
||||
startKoin {
|
||||
modules(
|
||||
textProcessingModule,
|
||||
module {
|
||||
// Inline module: the progress bar factory is registered as a `single` definition.
single<ProgressBarFactory> {
|
||||
TerminalProgressBarFactory()
|
||||
}
|
||||
})
|
||||
// The command is executed while still inside the startKoin block.
// NOTE(review): presumably this relies on the Koin context already being usable here — confirm.
ComputeIdf().main(args)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,7 +10,9 @@ repositories {
|
|||
}
|
||||
|
||||
// Dependencies of this module: kotlinx-coroutines-core and koin-core.
dependencies {
|
||||
// Version string interpolated into the koin-core coordinate below.
val koin_version = "3.5.3"
|
||||
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
|
||||
implementation("io.insert-koin:koin-core:$koin_version")
|
||||
}
|
||||
|
||||
java {
|
||||
|
|
|
|||
|
|
@ -1,22 +1,10 @@
|
|||
import org.codehaus.groovy.tools.shell.util.Logger.io
|
||||
|
||||
/*
|
||||
* This file was generated by the Gradle 'init' task.
|
||||
*
|
||||
* This project uses @Incubating APIs which are subject to change.
|
||||
*/
|
||||
|
||||
plugins {
|
||||
// Apply the common convention plugin for shared build configuration between library and application projects.
|
||||
id("docthor.kotlin-common-conventions")
|
||||
|
||||
// Apply the java-library plugin for API and implementation separation.
|
||||
`java-library`
|
||||
}
|
||||
|
||||
|
||||
dependencies {
|
||||
api("io.github.oshai:kotlin-logging-jvm:5.1.0")
|
||||
|
||||
implementation("org.slf4j:slf4j-api:2.0.9")
|
||||
}
|
||||
|
|
@ -1,17 +1,19 @@
|
|||
package de.itkl.fileprocessing
|
||||
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
import org.koin.core.annotation.KoinReflectAPI
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.inject
|
||||
import java.io.File
|
||||
import kotlin.io.path.exists
|
||||
|
||||
private val Log = KotlinLogging.logger { }
|
||||
|
||||
abstract class FileProcessingPipeline(private val force: Boolean = false) {
|
||||
|
||||
abstract class FileProcessingPipeline(private val force: Boolean = false) : KoinComponent {
|
||||
|
||||
|
||||
protected abstract val fileProcessor: List<FileProcessor>
|
||||
protected abstract val progressBarFactory: ProgressBarFactory
|
||||
private val progressBarFactory: ProgressBarFactory by inject()
|
||||
suspend fun input(file: File) {
|
||||
var currentFile = file
|
||||
fileProcessor.forEach { processor ->
|
||||
|
|
|
|||
|
|
@ -1,21 +1,19 @@
|
|||
package de.itkl.textprocessing
|
||||
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.flow.Flow
|
||||
import kotlinx.coroutines.flow.flow
|
||||
import kotlinx.coroutines.flow.onCompletion
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
|
||||
import org.apache.lucene.util.AttributeFactory
|
||||
import java.io.File
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.InputStream
|
||||
import java.io.InputStreamReader
|
||||
|
||||
|
||||
class TextFile(private val inputStream: InputStream) {
|
||||
fun splitByEmptyLines(): Flow<List<String>> {
|
||||
return InputStreamReader(inputStream).use { reader ->
|
||||
val reader = InputStreamReader(inputStream)
|
||||
var list = mutableListOf<String>()
|
||||
flow {
|
||||
return flow<List<String>> {
|
||||
reader.useLines { lines ->
|
||||
lines.forEach { line ->
|
||||
if(line.isEmpty()) {
|
||||
|
|
@ -26,7 +24,9 @@ class TextFile(private val inputStream: InputStream) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}.onCompletion {
|
||||
withContext(Dispatchers.IO) {
|
||||
reader.close()
|
||||
} }
|
||||
}
|
||||
}
|
||||
|
|
@ -1,23 +1,21 @@
|
|||
package de.itkl.textprocessing
|
||||
package de.itkl.textprocessing.implementation
|
||||
|
||||
import kotlinx.coroutines.flow.Flow
|
||||
import kotlinx.coroutines.flow.flow
|
||||
import kotlinx.coroutines.flow.onCompletion
|
||||
import de.itkl.textprocessing.interfaces.Tokenizer
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
|
||||
import org.apache.lucene.util.AttributeFactory
|
||||
import java.io.StringReader
|
||||
|
||||
|
||||
class Tokenizer {
|
||||
class LuceneTokenizer : Tokenizer {
|
||||
|
||||
private val tokenizer by lazy {
|
||||
val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
|
||||
val tokenizer = StandardTokenizer(factory)
|
||||
tokenizer
|
||||
}
|
||||
fun tokenize(input: String): Sequence<String> {
|
||||
val reader = StringReader(input)
|
||||
override fun tokenize(text: String): Sequence<String> {
|
||||
val reader = StringReader(text)
|
||||
tokenizer.setReader(reader)
|
||||
tokenizer.reset()
|
||||
val attr = tokenizer.addAttribute(CharTermAttribute::class.java)
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
package de.itkl.textprocessing.implementation
|
||||
|
||||
import de.itkl.textprocessing.interfaces.Stemmer
|
||||
import org.tartarus.snowball.ext.GermanStemmer
|
||||
|
||||
/**
 * [Stemmer] implementation that delegates to Snowball's [GermanStemmer].
 *
 * NOTE(review): every call writes the word into the one `german` instance held
 * by this object, so a single instance presumably must not be used from
 * several threads at once — confirm against the callers.
 */
class SnowballStemmerGerman : Stemmer {
|
||||
// Stemmer instance reused for every call on this object.
private val german = GermanStemmer()
|
||||
/**
 * Stems [word] by loading it into the underlying stemmer, running it, and
 * reading the result back.
 *
 * @param word the word to stem.
 * @return the stemmer's `current` value after `stem()` has run.
 */
override fun stem(word: String): String {
|
||||
german.current = word
|
||||
// The return value of stem() is ignored; the result is read from `current`.
german.stem()
|
||||
return german.current
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
package de.itkl.textprocessing.interfaces
|
||||
|
||||
/**
 * Abstraction over word stemming, so that callers do not depend on a concrete
 * stemming library.
 */
interface Stemmer {
|
||||
/**
 * Stems a single word.
 *
 * @param word the word to stem.
 * @return the stemmed form of [word].
 */
fun stem(word: String): String
|
||||
}
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
package de.itkl.textprocessing.interfaces
|
||||
|
||||
/**
 * Abstraction over text tokenization, so that callers do not depend on a
 * concrete tokenizer library.
 */
interface Tokenizer {
|
||||
/**
 * Splits [text] into tokens.
 *
 * @param text the input text to tokenize.
 * @return the tokens of [text] as a [Sequence].
 */
fun tokenize(text: String): Sequence<String>
|
||||
}
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
package de.itkl.textprocessing
|
||||
|
||||
import de.itkl.textprocessing.implementation.LuceneTokenizer
|
||||
import de.itkl.textprocessing.implementation.SnowballStemmerGerman
|
||||
import de.itkl.textprocessing.interfaces.Stemmer
|
||||
import de.itkl.textprocessing.interfaces.Tokenizer
|
||||
import org.koin.dsl.module
|
||||
|
||||
/**
 * Koin module for the text-processing components.
 *
 * Binds [Tokenizer] to [LuceneTokenizer] and [Stemmer] to
 * [SnowballStemmerGerman]. Both are declared with `factory`.
 * NOTE(review): per Koin's documentation a `factory` definition yields a new
 * instance on each resolution — confirm this is the intended lifetime.
 */
val textProcessingModule = module {
|
||||
factory<Tokenizer> { LuceneTokenizer() }
|
||||
factory<Stemmer> { SnowballStemmerGerman() }
|
||||
}
|
||||
|
|
@ -4,16 +4,20 @@ import de.itkl.fileprocessing.FileProcessor
|
|||
import de.itkl.fileprocessing.Resource
|
||||
import de.itkl.processing.parallelUnordered
|
||||
import de.itkl.textprocessing.*
|
||||
import de.itkl.textprocessing.interfaces.Stemmer
|
||||
import de.itkl.textprocessing.interfaces.Tokenizer
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
import kotlinx.coroutines.*
|
||||
import kotlinx.coroutines.flow.*
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.inject
|
||||
import java.io.File
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.nameWithoutExtension
|
||||
|
||||
private val Log = KotlinLogging.logger { }
|
||||
|
||||
class DocumentFrequency : FileProcessor {
|
||||
class DocumentFrequency : FileProcessor, KoinComponent {
|
||||
override fun willProduce(path: Path): Path {
|
||||
return path.parent.resolve(path.nameWithoutExtension + "-document-frequency.csv")
|
||||
}
|
||||
|
|
@ -37,10 +41,11 @@ class DocumentFrequency : FileProcessor {
|
|||
if (document.isEmpty()) {
|
||||
return Histogram()
|
||||
}
|
||||
val tokenizer = Tokenizer()
|
||||
val tokenizer: Tokenizer by inject()
|
||||
val stemmer: Stemmer by inject()
|
||||
val bagOfWords = document.map { line ->
|
||||
val tokens = tokenizer.tokenize(line)
|
||||
BagOfWords.from(tokens)
|
||||
BagOfWords.from(tokens.map { stemmer.stem(it) })
|
||||
}
|
||||
.reduce { acc, bagOfWords -> acc.join(bagOfWords) }
|
||||
return Histogram.fromBagOfWords(bagOfWords)
|
||||
|
|
|
|||
|
|
@ -34,5 +34,6 @@ class TerminalProgressBar(
|
|||
|
||||
override fun close() {
|
||||
animation.stop()
|
||||
println()
|
||||
}
|
||||
}
|
||||
|
|
@ -2,11 +2,10 @@ package de.itkl.tfidf
|
|||
|
||||
import de.itkl.fileprocessing.FileProcessingPipeline
|
||||
import de.itkl.fileprocessing.ProgressBarFactory
|
||||
import org.koin.core.component.KoinComponent
|
||||
|
||||
class TfIdfPipeline(private val language: Language, force: Boolean) : FileProcessingPipeline(force) {
|
||||
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
|
||||
override val fileProcessor = listOf(
|
||||
DocumentFrequency()
|
||||
)
|
||||
override val progressBarFactory: ProgressBarFactory
|
||||
get() = TerminalProgressBarFactory()
|
||||
}
|
||||
Loading…
Reference in New Issue