utilize koin
parent
606837a76f
commit
81a30dd2f6
|
|
@ -6,14 +6,17 @@ import com.github.ajalt.clikt.parameters.options.option
|
||||||
import com.github.ajalt.clikt.parameters.options.required
|
import com.github.ajalt.clikt.parameters.options.required
|
||||||
import com.github.ajalt.clikt.parameters.types.enum
|
import com.github.ajalt.clikt.parameters.types.enum
|
||||||
import com.github.ajalt.clikt.parameters.types.file
|
import com.github.ajalt.clikt.parameters.types.file
|
||||||
import de.itkl.textprocessing.TextFile
|
import de.itkl.fileprocessing.ProgressBarFactory
|
||||||
|
import de.itkl.textprocessing.textProcessingModule
|
||||||
import de.itkl.tfidf.Language
|
import de.itkl.tfidf.Language
|
||||||
|
import de.itkl.tfidf.TerminalProgressBarFactory
|
||||||
//import de.itkl.tfidf.TfIdf
|
//import de.itkl.tfidf.TfIdf
|
||||||
import de.itkl.tfidf.TfIdfPipeline
|
import de.itkl.tfidf.TfIdfPipeline
|
||||||
import kotlinx.coroutines.flow.take
|
|
||||||
import kotlinx.coroutines.runBlocking
|
import kotlinx.coroutines.runBlocking
|
||||||
|
import org.koin.core.context.startKoin
|
||||||
|
import org.koin.dsl.module
|
||||||
|
|
||||||
class ComputeTf : CliktCommand() {
|
class ComputeIdf : CliktCommand() {
|
||||||
private val corpus by option(help = "corpus")
|
private val corpus by option(help = "corpus")
|
||||||
.file()
|
.file()
|
||||||
.required()
|
.required()
|
||||||
|
|
@ -22,18 +25,20 @@ class ComputeTf : CliktCommand() {
|
||||||
.required()
|
.required()
|
||||||
|
|
||||||
override fun run() = runBlocking {
|
override fun run() = runBlocking {
|
||||||
TfIdfPipeline(language = Language.DE, force = true)
|
TfIdfPipeline(force = true)
|
||||||
.input(corpus)
|
.input(corpus)
|
||||||
// TextFile(corpus).splitByEmptyLines()
|
|
||||||
// .take(10)
|
|
||||||
// .collect { println(it) }
|
|
||||||
// val tfIdf = TfIdf()
|
|
||||||
// val histogram = tfIdf.computeTf(
|
|
||||||
// corpus,
|
|
||||||
// language
|
|
||||||
// )
|
|
||||||
// val tf = tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fun main(args: Array<String>) = ComputeTf().main(args)
|
fun main(args: Array<String>) {
|
||||||
|
startKoin {
|
||||||
|
modules(
|
||||||
|
textProcessingModule,
|
||||||
|
module {
|
||||||
|
single<ProgressBarFactory> {
|
||||||
|
TerminalProgressBarFactory()
|
||||||
|
}
|
||||||
|
})
|
||||||
|
ComputeIdf().main(args)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,9 @@ repositories {
|
||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
|
val koin_version = "3.5.3"
|
||||||
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
|
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
|
||||||
|
implementation("io.insert-koin:koin-core:$koin_version")
|
||||||
}
|
}
|
||||||
|
|
||||||
java {
|
java {
|
||||||
|
|
|
||||||
|
|
@ -1,22 +1,10 @@
|
||||||
import org.codehaus.groovy.tools.shell.util.Logger.io
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This file was generated by the Gradle 'init' task.
|
|
||||||
*
|
|
||||||
* This project uses @Incubating APIs which are subject to change.
|
|
||||||
*/
|
|
||||||
|
|
||||||
plugins {
|
plugins {
|
||||||
// Apply the common convention plugin for shared build configuration between library and application projects.
|
|
||||||
id("docthor.kotlin-common-conventions")
|
id("docthor.kotlin-common-conventions")
|
||||||
|
|
||||||
// Apply the java-library plugin for API and implementation separation.
|
|
||||||
`java-library`
|
`java-library`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
api("io.github.oshai:kotlin-logging-jvm:5.1.0")
|
api("io.github.oshai:kotlin-logging-jvm:5.1.0")
|
||||||
|
|
||||||
implementation("org.slf4j:slf4j-api:2.0.9")
|
implementation("org.slf4j:slf4j-api:2.0.9")
|
||||||
}
|
}
|
||||||
|
|
@ -1,17 +1,19 @@
|
||||||
package de.itkl.fileprocessing
|
package de.itkl.fileprocessing
|
||||||
|
|
||||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||||
|
import org.koin.core.annotation.KoinReflectAPI
|
||||||
|
import org.koin.core.component.KoinComponent
|
||||||
|
import org.koin.core.component.inject
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import kotlin.io.path.exists
|
import kotlin.io.path.exists
|
||||||
|
|
||||||
private val Log = KotlinLogging.logger { }
|
private val Log = KotlinLogging.logger { }
|
||||||
|
|
||||||
abstract class FileProcessingPipeline(private val force: Boolean = false) {
|
abstract class FileProcessingPipeline(private val force: Boolean = false) : KoinComponent {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
protected abstract val fileProcessor: List<FileProcessor>
|
protected abstract val fileProcessor: List<FileProcessor>
|
||||||
protected abstract val progressBarFactory: ProgressBarFactory
|
private val progressBarFactory: ProgressBarFactory by inject()
|
||||||
suspend fun input(file: File) {
|
suspend fun input(file: File) {
|
||||||
var currentFile = file
|
var currentFile = file
|
||||||
fileProcessor.forEach { processor ->
|
fileProcessor.forEach { processor ->
|
||||||
|
|
|
||||||
|
|
@ -1,21 +1,19 @@
|
||||||
package de.itkl.textprocessing
|
package de.itkl.textprocessing
|
||||||
|
|
||||||
|
import kotlinx.coroutines.Dispatchers
|
||||||
import kotlinx.coroutines.flow.Flow
|
import kotlinx.coroutines.flow.Flow
|
||||||
import kotlinx.coroutines.flow.flow
|
import kotlinx.coroutines.flow.flow
|
||||||
import kotlinx.coroutines.flow.onCompletion
|
import kotlinx.coroutines.flow.onCompletion
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer
|
import kotlinx.coroutines.withContext
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
|
|
||||||
import org.apache.lucene.util.AttributeFactory
|
|
||||||
import java.io.File
|
|
||||||
import java.io.InputStream
|
import java.io.InputStream
|
||||||
import java.io.InputStreamReader
|
import java.io.InputStreamReader
|
||||||
|
|
||||||
|
|
||||||
class TextFile(private val inputStream: InputStream) {
|
class TextFile(private val inputStream: InputStream) {
|
||||||
fun splitByEmptyLines(): Flow<List<String>> {
|
fun splitByEmptyLines(): Flow<List<String>> {
|
||||||
return InputStreamReader(inputStream).use { reader ->
|
val reader = InputStreamReader(inputStream)
|
||||||
var list = mutableListOf<String>()
|
var list = mutableListOf<String>()
|
||||||
flow {
|
return flow<List<String>> {
|
||||||
reader.useLines { lines ->
|
reader.useLines { lines ->
|
||||||
lines.forEach { line ->
|
lines.forEach { line ->
|
||||||
if(line.isEmpty()) {
|
if(line.isEmpty()) {
|
||||||
|
|
@ -26,7 +24,9 @@ class TextFile(private val inputStream: InputStream) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}.onCompletion {
|
||||||
}
|
withContext(Dispatchers.IO) {
|
||||||
|
reader.close()
|
||||||
|
} }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1,23 +1,21 @@
|
||||||
package de.itkl.textprocessing
|
package de.itkl.textprocessing.implementation
|
||||||
|
|
||||||
import kotlinx.coroutines.flow.Flow
|
import de.itkl.textprocessing.interfaces.Tokenizer
|
||||||
import kotlinx.coroutines.flow.flow
|
|
||||||
import kotlinx.coroutines.flow.onCompletion
|
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer
|
import org.apache.lucene.analysis.standard.StandardTokenizer
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
|
||||||
import org.apache.lucene.util.AttributeFactory
|
import org.apache.lucene.util.AttributeFactory
|
||||||
import java.io.StringReader
|
import java.io.StringReader
|
||||||
|
|
||||||
|
|
||||||
class Tokenizer {
|
class LuceneTokenizer : Tokenizer {
|
||||||
|
|
||||||
private val tokenizer by lazy {
|
private val tokenizer by lazy {
|
||||||
val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
|
val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
|
||||||
val tokenizer = StandardTokenizer(factory)
|
val tokenizer = StandardTokenizer(factory)
|
||||||
tokenizer
|
tokenizer
|
||||||
}
|
}
|
||||||
fun tokenize(input: String): Sequence<String> {
|
override fun tokenize(text: String): Sequence<String> {
|
||||||
val reader = StringReader(input)
|
val reader = StringReader(text)
|
||||||
tokenizer.setReader(reader)
|
tokenizer.setReader(reader)
|
||||||
tokenizer.reset()
|
tokenizer.reset()
|
||||||
val attr = tokenizer.addAttribute(CharTermAttribute::class.java)
|
val attr = tokenizer.addAttribute(CharTermAttribute::class.java)
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
package de.itkl.textprocessing.implementation
|
||||||
|
|
||||||
|
import de.itkl.textprocessing.interfaces.Stemmer
|
||||||
|
import org.tartarus.snowball.ext.GermanStemmer
|
||||||
|
|
||||||
|
class SnowballStemmerGerman : Stemmer {
|
||||||
|
private val german = GermanStemmer()
|
||||||
|
override fun stem(word: String): String {
|
||||||
|
german.current = word
|
||||||
|
german.stem()
|
||||||
|
return german.current
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
package de.itkl.textprocessing.interfaces
|
||||||
|
|
||||||
|
interface Stemmer {
|
||||||
|
fun stem(word: String): String
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
package de.itkl.textprocessing.interfaces
|
||||||
|
|
||||||
|
interface Tokenizer {
|
||||||
|
fun tokenize(text: String): Sequence<String>
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,12 @@
|
||||||
|
package de.itkl.textprocessing
|
||||||
|
|
||||||
|
import de.itkl.textprocessing.implementation.LuceneTokenizer
|
||||||
|
import de.itkl.textprocessing.implementation.SnowballStemmerGerman
|
||||||
|
import de.itkl.textprocessing.interfaces.Stemmer
|
||||||
|
import de.itkl.textprocessing.interfaces.Tokenizer
|
||||||
|
import org.koin.dsl.module
|
||||||
|
|
||||||
|
val textProcessingModule = module {
|
||||||
|
factory<Tokenizer> { LuceneTokenizer() }
|
||||||
|
factory<Stemmer> { SnowballStemmerGerman() }
|
||||||
|
}
|
||||||
|
|
@ -4,16 +4,20 @@ import de.itkl.fileprocessing.FileProcessor
|
||||||
import de.itkl.fileprocessing.Resource
|
import de.itkl.fileprocessing.Resource
|
||||||
import de.itkl.processing.parallelUnordered
|
import de.itkl.processing.parallelUnordered
|
||||||
import de.itkl.textprocessing.*
|
import de.itkl.textprocessing.*
|
||||||
|
import de.itkl.textprocessing.interfaces.Stemmer
|
||||||
|
import de.itkl.textprocessing.interfaces.Tokenizer
|
||||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||||
import kotlinx.coroutines.*
|
import kotlinx.coroutines.*
|
||||||
import kotlinx.coroutines.flow.*
|
import kotlinx.coroutines.flow.*
|
||||||
|
import org.koin.core.component.KoinComponent
|
||||||
|
import org.koin.core.component.inject
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
import kotlin.io.path.nameWithoutExtension
|
import kotlin.io.path.nameWithoutExtension
|
||||||
|
|
||||||
private val Log = KotlinLogging.logger { }
|
private val Log = KotlinLogging.logger { }
|
||||||
|
|
||||||
class DocumentFrequency : FileProcessor {
|
class DocumentFrequency : FileProcessor, KoinComponent {
|
||||||
override fun willProduce(path: Path): Path {
|
override fun willProduce(path: Path): Path {
|
||||||
return path.parent.resolve(path.nameWithoutExtension + "-document-frequency.csv")
|
return path.parent.resolve(path.nameWithoutExtension + "-document-frequency.csv")
|
||||||
}
|
}
|
||||||
|
|
@ -37,10 +41,11 @@ class DocumentFrequency : FileProcessor {
|
||||||
if (document.isEmpty()) {
|
if (document.isEmpty()) {
|
||||||
return Histogram()
|
return Histogram()
|
||||||
}
|
}
|
||||||
val tokenizer = Tokenizer()
|
val tokenizer: Tokenizer by inject()
|
||||||
|
val stemmer: Stemmer by inject()
|
||||||
val bagOfWords = document.map { line ->
|
val bagOfWords = document.map { line ->
|
||||||
val tokens = tokenizer.tokenize(line)
|
val tokens = tokenizer.tokenize(line)
|
||||||
BagOfWords.from(tokens)
|
BagOfWords.from(tokens.map { stemmer.stem(it) })
|
||||||
}
|
}
|
||||||
.reduce { acc, bagOfWords -> acc.join(bagOfWords) }
|
.reduce { acc, bagOfWords -> acc.join(bagOfWords) }
|
||||||
return Histogram.fromBagOfWords(bagOfWords)
|
return Histogram.fromBagOfWords(bagOfWords)
|
||||||
|
|
|
||||||
|
|
@ -34,5 +34,6 @@ class TerminalProgressBar(
|
||||||
|
|
||||||
override fun close() {
|
override fun close() {
|
||||||
animation.stop()
|
animation.stop()
|
||||||
|
println()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -2,11 +2,10 @@ package de.itkl.tfidf
|
||||||
|
|
||||||
import de.itkl.fileprocessing.FileProcessingPipeline
|
import de.itkl.fileprocessing.FileProcessingPipeline
|
||||||
import de.itkl.fileprocessing.ProgressBarFactory
|
import de.itkl.fileprocessing.ProgressBarFactory
|
||||||
|
import org.koin.core.component.KoinComponent
|
||||||
|
|
||||||
class TfIdfPipeline(private val language: Language, force: Boolean) : FileProcessingPipeline(force) {
|
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
|
||||||
override val fileProcessor = listOf(
|
override val fileProcessor = listOf(
|
||||||
DocumentFrequency()
|
DocumentFrequency()
|
||||||
)
|
)
|
||||||
override val progressBarFactory: ProgressBarFactory
|
|
||||||
get() = TerminalProgressBarFactory()
|
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue