Compare commits

...

10 Commits

Author SHA1 Message Date
Timo Bryant 78af3f0d50 add resource and FileSystemResource 2023-12-22 00:57:49 +01:00
Timo Bryant d973262dbd starting with io 2023-12-22 00:39:10 +01:00
Timo Bryant 1ef987f611 rework build logic 2023-12-22 00:18:53 +01:00
Timo Bryant c40ab54012 use progress.step 2023-12-21 23:40:51 +01:00
Timo Bryant 3e5534f184 maybe idf is correct now :D 2023-12-21 23:40:25 +01:00
Timo Bryant 81a30dd2f6 utilize koin 2023-12-21 18:16:12 +01:00
Timo Bryant 606837a76f code cleanup 2023-12-21 17:31:09 +01:00
Timo Bryant 46f1c49ab1 fix build 2023-12-21 17:12:42 +01:00
Timo Bryant 4cafac4583 refactoring into parallelUnordered method 2023-12-18 22:55:29 +01:00
Timo Bryant 13110fa8e5 paralleling finally works 2023-12-18 21:59:15 +01:00
44 changed files with 413 additions and 295 deletions

View File

@ -6,14 +6,17 @@ import com.github.ajalt.clikt.parameters.options.option
import com.github.ajalt.clikt.parameters.options.required
import com.github.ajalt.clikt.parameters.types.enum
import com.github.ajalt.clikt.parameters.types.file
import de.itkl.textprocessing.TextFile
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.textprocessing.textProcessingModule
import de.itkl.tfidf.Language
import de.itkl.tfidf.TerminalProgressBarFactory
//import de.itkl.tfidf.TfIdf
import de.itkl.tfidf.TfIdfPipeline
import kotlinx.coroutines.flow.take
import kotlinx.coroutines.runBlocking
import org.koin.core.context.startKoin
import org.koin.dsl.module
class ComputeTf : CliktCommand() {
class ComputeIdf : CliktCommand() {
private val corpus by option(help = "corpus")
.file()
.required()
@ -22,18 +25,20 @@ class ComputeTf : CliktCommand() {
.required()
override fun run() = runBlocking {
TfIdfPipeline(language = Language.DE)
TfIdfPipeline(force = true)
.input(corpus)
// TextFile(corpus).splitByEmptyLines()
// .take(10)
// .collect { println(it) }
// val tfIdf = TfIdf()
// val histogram = tfIdf.computeTf(
// corpus,
// language
// )
// val tf = tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile())
}
}
fun main(args: Array<String>) = ComputeTf().main(args)
fun main(args: Array<String>) {
    // Bootstrap the Koin container first, then hand control to the CLI.
    // Fix: the original invoked ComputeIdf().main(args) *inside* the
    // startKoin { } declaration lambda, mixing application logic into the
    // container declaration; the startup sequence is now explicit.
    startKoin {
        modules(
            textProcessingModule,
            module {
                single<ProgressBarFactory> { TerminalProgressBarFactory() }
            },
        )
    }
    // NOTE(review): the io library resolves a ResourceReadDecorator from Koin
    // when a resource is read; if that path is reached at runtime, its module
    // must be registered here as well — verify.
    ComputeIdf().main(args)
}

View File

@ -1,19 +0,0 @@
/*
* This file was generated by the Gradle 'init' task.
*
* This project uses @Incubating APIs which are subject to change.
*/
plugins {
// Support convention plugins written in Kotlin. Convention plugins are build scripts in 'src/main' that automatically become available as plugins in the main build.
`kotlin-dsl`
}
repositories {
// Use the plugin portal to apply community plugins in convention plugins.
gradlePluginPortal()
}
dependencies {
implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:1.8.20")
}

View File

@ -1,8 +0,0 @@
/*
* This file was generated by the Gradle 'init' task.
*
* This settings file is used to specify which projects to include in your build-logic build.
* This project uses @Incubating APIs which are subject to change.
*/
rootProject.name = "docthor-build-logic"

View File

@ -1,22 +0,0 @@
import org.codehaus.groovy.tools.shell.util.Logger.io
/*
* This file was generated by the Gradle 'init' task.
*
* This project uses @Incubating APIs which are subject to change.
*/
plugins {
// Apply the common convention plugin for shared build configuration between library and application projects.
id("docthor.kotlin-common-conventions")
// Apply the java-library plugin for API and implementation separation.
`java-library`
}
dependencies {
api("io.github.oshai:kotlin-logging-jvm:5.1.0")
implementation("org.slf4j:slf4j-api:2.0.9")
}

3
build.gradle.kts Normal file
View File

@ -0,0 +1,3 @@
// Root build: apply the shared library conventions plugin to every
// subproject that lives under :libraries.
project(":libraries").subprojects {
    apply(plugin = "docthor.kotlin-library-conventions")
}

11
buildSrc/build.gradle.kts Normal file
View File

@ -0,0 +1,11 @@
// buildSrc build: compiles the convention plugins consumed by the main build.
plugins {
    // Enables writing precompiled script plugins in Kotlin.
    `kotlin-dsl`
}
repositories {
    // Convention plugins may pull community plugins from the portal.
    gradlePluginPortal()
}
dependencies {
    implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:1.8.20")
}

View File

@ -0,0 +1 @@
rootProject.name = "docthor-build-logic"

View File

@ -1,4 +1,5 @@
import org.gradle.api.plugins.jvm.JvmTestSuite
import org.jetbrains.kotlin.gradle.dsl.JvmTarget
plugins {
id("org.jetbrains.kotlin.jvm")
@ -9,7 +10,23 @@ repositories {
}
dependencies {
val koin_version = "3.5.3"
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
implementation("io.insert-koin:koin-core:$koin_version")
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of("19"))
}
}
tasks
.withType<org.jetbrains.kotlin.gradle.tasks.KotlinJvmCompile>()
.configureEach {
compilerOptions {
jvmTarget.set(JvmTarget.JVM_19)
}
}
testing {

View File

@ -0,0 +1,10 @@
plugins {
id("docthor.kotlin-common-conventions")
`java-library`
}
dependencies {
api("io.github.oshai:kotlin-logging-jvm:5.1.0")
implementation("org.slf4j:slf4j-api:2.0.9")
}

View File

@ -1,6 +1,3 @@
# This file was generated by the Gradle 'init' task.
# https://docs.gradle.org/current/userguide/build_environment.html#sec:gradle_configuration_properties
org.gradle.parallel=true
org.gradle.caching=true

View File

@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.2.1-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.5-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME

View File

@ -1,7 +1,3 @@
plugins {
id("docthor.kotlin-library-conventions")
}
dependencies {
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
}

View File

@ -1,26 +1,31 @@
package de.itkl.fileprocessing
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.annotation.KoinReflectAPI
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
import kotlin.io.path.exists
private val Log = KotlinLogging.logger { }
abstract class FileProcessingPipeline {
abstract class FileProcessingPipeline(private val force: Boolean = false) : KoinComponent {
protected abstract val fileProcessor: List<FileProcessor>
protected abstract val progressBarFactory: ProgressBarFactory
private val progressBarFactory: ProgressBarFactory by inject()
suspend fun input(file: File) {
var currentFile = file
fileProcessor.forEach { processor ->
val target = processor.willProduce(currentFile.toPath())
if(target.exists()) {
if(target.exists() && !force) {
Log.info { "$target exists. Skipping" }
} else {
Log.info { "$target does not exists. Creating" }
val resource = FileResource(currentFile)
val progress = ProgressResource(resource, progressBarFactory)
processor.process(progress)
Log.info { "File created: $target" }
}
currentFile = target.toFile()
}

View File

@ -2,8 +2,10 @@ package de.itkl.fileprocessing
interface ProgressBarFactory {
fun new(resource: Resource): ProgressBar
fun new(name: String, max: Long): ProgressBar
}
interface ProgressBar : AutoCloseable {
fun update(bytesRead: Long)
fun update(progressed: Long)
fun step()
}

View File

@ -1,35 +0,0 @@
package de.itkl.processing
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.map
import kotlinx.coroutines.flow.toList
import kotlinx.coroutines.runBlocking
import kotlinx.coroutines.withContext
import java.util.concurrent.Executors
import java.util.concurrent.TimeUnit
@Suppress("UNCHECKED_CAST")
class ParallelFlowProcessor<T,U>(
private val mapperFn: (T) -> U) {
companion object {
private val workers = Executors.newWorkStealingPool(16)
}
suspend fun process(flow: Flow<T>): Flow<U> {
return flow {
flow.map { kotlinx.coroutines.Runnable {
val result = mapperFn(it)
runBlocking { emit(result) }
} }
.map { job -> workers.submit(job)}
.toList()
.forEach { future -> emit(future.get() as U) }
withContext(Dispatchers.IO) {
workers.awaitTermination(10000, TimeUnit.DAYS)
}
}
}
}

View File

@ -0,0 +1,43 @@
package de.itkl.processing
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.*
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.channels.consumeEach
import kotlinx.coroutines.flow.*
private val Log = KotlinLogging.logger { }
/**
 * Thin [Flow] wrapper around the worker-fed flow produced by
 * [parallelUnordered]; collection is simply forwarded, so values arrive in
 * whatever order the workers finish.
 */
class ParallelUnorderedFlow<U>(
    private val mapperFlow: Flow<U>
) : Flow<U> {
    override suspend fun collect(collector: FlowCollector<U>) = mapperFlow.collect(collector)
}
/**
 * Fans this flow out to [numWorkers] coroutines that each apply [mapperFn],
 * re-emitting results in completion order (i.e. unordered).
 *
 * Fix: the original launched workers over the inclusive range
 * `(0..numWorkers)`, starting one worker more than requested.
 *
 * @param scope     scope that owns the producer coroutine feeding the workers
 * @param numWorkers number of concurrent mapper coroutines
 * @param mapperFn  mapping applied to each element, run on Dispatchers.Default
 */
suspend fun <T : Any, U : Any> Flow<T>.parallelUnordered(
    scope: CoroutineScope,
    numWorkers: Int,
    mapperFn: (T) -> U): Flow<U> {
    // Funnel the upstream flow into a channel so multiple workers can share it.
    val producerChannel = Channel<T>()
    scope.launch(Dispatchers.Default) {
        collect {
            producerChannel.send(it)
        }
        // Close once upstream is drained so consumeEach loops terminate.
        producerChannel.close()
    }
    val mapperFlow = channelFlow {
        repeat(numWorkers) {
            launch(Dispatchers.Default) {
                producerChannel.consumeEach {
                    send(mapperFn(it))
                }
            }
        }
    }
    return ParallelUnorderedFlow(mapperFlow)
}

View File

@ -0,0 +1,3 @@
dependencies {
implementation("io.ktor:ktor-http-jvm:2.3.7")
}

View File

@ -0,0 +1,19 @@
package de.itkl.io.implementation
import de.itkl.io.interfaces.Resource
import io.ktor.http.*
import java.io.File
import java.io.InputStream
/**
 * [Resource] backed by a file on the local file system.
 */
class FileSystemResource(private val file: File) : Resource() {
    override val filename: String
        get() = file.name
    override val contentType: ContentType
        // Fix: ContentType.fromFilePath can return an empty list for an
        // unknown extension; the original called first() and would throw
        // NoSuchElementException. Fall back to the generic binary type.
        get() = ContentType.fromFilePath(file.path).firstOrNull()
            ?: ContentType.Application.OctetStream
    override val length: Long
        get() = file.length()
    override fun doRead(): InputStream {
        return file.inputStream()
    }
}

View File

@ -0,0 +1,24 @@
package de.itkl.io.interfaces
import io.ktor.http.*
import org.koin.core.component.KoinComponent
import org.koin.core.component.get
import org.koin.core.qualifier.named
import java.io.InputStream
import java.io.InputStreamReader
/**
 * A readable, named piece of content. Subclasses provide the raw stream via
 * [doRead]; [read] wraps it with the [ResourceReadDecorator] resolved from
 * Koin when the total length is known, and returns it undecorated otherwise.
 */
abstract class Resource : KoinComponent {
    abstract val filename: String
    abstract val contentType: ContentType
    // Null when the size is not known up front.
    abstract val length: Long?
    protected abstract fun doRead(): InputStream
    fun read(): InputStream {
        // Fix: the original called read() recursively in both branches
        // instead of doRead(), causing unbounded recursion on every call.
        return length?.let { length ->
            get<ResourceReadDecorator>().decorate(
                length = length,
                inputStream = doRead()
            )
        } ?: doRead()
    }
}

View File

@ -0,0 +1,15 @@
package de.itkl.io.interfaces
import java.io.InputStream
/**
 * Hook for wrapping a resource's [InputStream] before it reaches callers
 * (e.g. to add progress reporting). [length] is the expected total number
 * of bytes the stream will yield.
 */
interface ResourceReadDecorator {
    fun decorate(
        length: Long,
        inputStream: InputStream): InputStream
}

/** Pass-through decorator: returns the stream unchanged. */
class NoopResourceReadDecorator : ResourceReadDecorator {
    override fun decorate(length: Long, inputStream: InputStream): InputStream {
        return inputStream
    }
}

View File

@ -0,0 +1,9 @@
package de.itkl.io
import de.itkl.io.interfaces.NoopResourceReadDecorator
import de.itkl.io.interfaces.ResourceReadDecorator
import org.koin.dsl.module
/**
 * Koin bindings for the io library. Resource reads are undecorated by
 * default; applications can override the [ResourceReadDecorator] binding
 * to add e.g. progress tracking.
 */
val ioModule = module {
    single<ResourceReadDecorator> { NoopResourceReadDecorator() }
}

View File

@ -1,9 +1,6 @@
plugins {
id("docthor.kotlin-library-conventions")
}
dependencies {
api("org.apache.lucene:lucene-analysis-common:9.9.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
implementation("com.google.guava:guava:32.1.3-jre")
}

View File

@ -16,7 +16,8 @@ class BagOfWords(private val data: MutableSet<String> = mutableSetOf()) : Iterab
}
fun join(bagOfWords: BagOfWords): BagOfWords {
return BagOfWords(data.toMutableSet().apply { addAll(bagOfWords.data) })
data.addAll(bagOfWords.data)
return this
}
override fun iterator(): Iterator<String> {

View File

@ -0,0 +1,4 @@
package de.itkl.textprocessing
class DocumentContainer {
}

View File

@ -11,10 +11,16 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
}
}
/** Builds a histogram counting each word of [bagOfWords] once. */
fun fromBagOfWords(bagOfWords: BagOfWords): Histogram {
    val histogram = Histogram()
    for (word in bagOfWords) {
        histogram.add(word)
    }
    return histogram
}
suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram {
val result = Histogram()
flow.collectIndexed { index, value ->
println(index)
flow.collect() { value ->
value.forEach(result::add)
}
return result
@ -27,12 +33,24 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
}
}
/** Merges every count from [other] into this histogram; returns this. */
fun join(other: Histogram): Histogram {
    for ((word, count) in other) {
        histo.merge(word, count) { existing, incoming -> existing + incoming }
    }
    return this
}
fun add(word: String) {
histo.compute(word) { _, count ->
count?.let { it + 1u } ?: 1u
}
}
/**
 * Stores an explicit count for [word], replacing any existing entry.
 * Fix: rejects negative input, which previously wrapped silently to a
 * huge value via Int.toUInt().
 */
fun set(word: String, count: Int) {
    require(count >= 0) { "count must be non-negative: $count" }
    histo[word] = count.toUInt()
}
val size get() = histo.size
override fun iterator(): Iterator<Pair<String, UInt>> {
return iterator {

View File

@ -1,22 +1,19 @@
package de.itkl.textprocessing
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.onCompletion
import org.apache.lucene.analysis.standard.StandardTokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.AttributeFactory
import java.io.File
import kotlinx.coroutines.withContext
import java.io.InputStream
import java.io.InputStreamReader
class TextFile(val inputStream: InputStream) {
class TextFile(private val inputStream: InputStream) {
fun splitByEmptyLines(): Flow<List<String>> {
val reader = InputStreamReader(inputStream)
var list = mutableListOf<String>()
return flow {
return flow<List<String>> {
reader.useLines { lines ->
lines.forEach { line ->
if(line.isEmpty()) {
@ -27,21 +24,9 @@ class TextFile(val inputStream: InputStream) {
}
}
}
}.onCompletion {
withContext(Dispatchers.IO) {
reader.close()
} }
}
}
// fun words(progressOp: (read: Long) -> Unit = {}): Flow<String> {
// val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
// val tokenizer = StandardTokenizer(factory)
// val reader = ProgressInputStream(file.inputStream(), progressOp)
// tokenizer.setReader(InputStreamReader(reader))
// tokenizer.reset()
// val attr = tokenizer.addAttribute(CharTermAttribute::class.java)
// return flow {
// while (kotlin.runCatching { tokenizer.incrementToken() }.getOrElse { true } ) {
// emit(attr.toString())
// }
// }.onCompletion {
// tokenizer.close()
// }
// }
}

View File

@ -1,23 +1,21 @@
package de.itkl.textprocessing
package de.itkl.textprocessing.implementation
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.onCompletion
import de.itkl.textprocessing.interfaces.Tokenizer
import org.apache.lucene.analysis.standard.StandardTokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.AttributeFactory
import java.io.StringReader
class Tokenizer {
class LuceneTokenizer : Tokenizer {
private val tokenizer by lazy {
val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
val tokenizer = StandardTokenizer(factory)
tokenizer
}
fun tokenize(input: String): Sequence<String> {
val reader = StringReader(input)
override fun tokenize(text: String): Sequence<String> {
val reader = StringReader(text)
tokenizer.setReader(reader)
tokenizer.reset()
val attr = tokenizer.addAttribute(CharTermAttribute::class.java)

View File

@ -0,0 +1,13 @@
package de.itkl.textprocessing.implementation
import de.itkl.textprocessing.interfaces.Stemmer
import org.tartarus.snowball.ext.GermanStemmer
/**
 * [Stemmer] backed by the Snowball German stemmer. The wrapped
 * GermanStemmer is mutated on every call (its `current` field holds the
 * word being stemmed), so one instance must not be shared across threads.
 */
class SnowballStemmerGerman : Stemmer {
    private val german = GermanStemmer()

    override fun stem(word: String): String = german.run {
        current = word
        stem()
        current
    }
}

View File

@ -0,0 +1,3 @@
package de.itkl.textprocessing.interfaces
// Marker interface for managing a document's associated assets; no
// operations are defined yet.
interface DocumentAssetManager {}

View File

@ -0,0 +1,4 @@
package de.itkl.textprocessing.interfaces
// Placeholder for a component that extracts documents from a source; no
// operations are defined yet.
interface DocumentExtractor {
}

View File

@ -0,0 +1,5 @@
package de.itkl.textprocessing.interfaces
/** Reduces a word to its stem so inflected forms can be counted together. */
interface Stemmer {
    fun stem(word: String): String
}

View File

@ -0,0 +1,5 @@
package de.itkl.textprocessing.interfaces
/** Splits raw text into a lazy sequence of tokens. */
interface Tokenizer {
    fun tokenize(text: String): Sequence<String>
}

View File

@ -0,0 +1,12 @@
package de.itkl.textprocessing
import de.itkl.textprocessing.implementation.LuceneTokenizer
import de.itkl.textprocessing.implementation.SnowballStemmerGerman
import de.itkl.textprocessing.interfaces.Stemmer
import de.itkl.textprocessing.interfaces.Tokenizer
import org.koin.dsl.module
/**
 * Koin bindings for the textprocessing library. Implementations are bound
 * as factories, yielding a fresh (stateful) instance per injection.
 *
 * NOTE(review): the only [Stemmer] registered is the German Snowball
 * stemmer, so consumers stem German regardless of corpus language —
 * confirm this is intended.
 */
val textProcessingModule = module {
    factory<Tokenizer> { LuceneTokenizer() }
    factory<Stemmer> { SnowballStemmerGerman() }
}

View File

@ -1,10 +1,7 @@
plugins {
id("docthor.kotlin-library-conventions")
}
dependencies {
api(project(":libraries:textprocessing"))
api(project(":libraries:fileprocessing"))
implementation("com.github.ajalt.mordant:mordant:2.2.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
implementation("com.google.guava:guava:32.1.3-jre")
}

View File

@ -0,0 +1,57 @@
package de.itkl.tfidf
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.Resource
import de.itkl.processing.parallelUnordered
import de.itkl.textprocessing.*
import de.itkl.textprocessing.interfaces.Stemmer
import de.itkl.textprocessing.interfaces.Tokenizer
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.*
import kotlinx.coroutines.flow.*
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
import kotlin.math.max
private val Log = KotlinLogging.logger { }
/**
 * [FileProcessor] that counts, for every stemmed token, how many documents
 * of the corpus contain it, writing the result as a CSV next to the input.
 * The total number of documents is stored under the pseudo-word "$numDocs"
 * so a later stage can compute idf.
 */
class DocumentFrequency : FileProcessor, KoinComponent {
    override fun willProduce(path: Path): Path {
        return path.parent.resolve(path.nameWithoutExtension + "-document-frequency.csv")
    }

    override suspend fun process(resource: Resource): File = coroutineScope {
        Log.info { "Would produce: ${willProduce(resource.path)}" }
        val resultFile = willProduce(resource.path).toFile()
        // Documents are processed by 16 workers in parallel; each result is
        // (document index, per-document histogram). Fold keeps the largest
        // index seen and merges the histograms. Using fold (not reduce) also
        // survives an empty corpus instead of throwing.
        val (maxIndex, histogram) = TextFile(resource.read())
            .splitByEmptyLines()
            .withIndex()
            .parallelUnordered(this, 16) { (index, doc) ->
                index to collectWordsOfDocument(doc)
            }
            .fold(-1 to Histogram()) { (index, acc), (otherIndex, other) ->
                max(index, otherIndex) to acc.join(other)
            }
        Log.info { "Writing CSV $resultFile" }
        // Fix: withIndex() is zero-based, so the document count is the
        // largest index plus one; the original stored the raw index,
        // under-counting by one.
        histogram.set("\$numDocs", maxIndex + 1)
        HistogramCsvStorage().save(histogram, resultFile)
        resultFile
    }

    // Counts each distinct stemmed token of one document exactly once
    // (set semantics via BagOfWords).
    private fun collectWordsOfDocument(document: List<String>): Histogram {
        if (document.isEmpty()) {
            return Histogram()
        }
        // Resolved per document on purpose: tokenizer/stemmer are bound as
        // Koin factories and hold mutable state, so each parallel worker
        // invocation gets its own instances.
        val tokenizer: Tokenizer by inject()
        val stemmer: Stemmer by inject()
        val bagOfWords = document.map { line ->
            val tokens = tokenizer.tokenize(line)
            BagOfWords.from(tokens.map { stemmer.stem(it) })
        }
            .reduce { acc, bagOfWords -> acc.join(bagOfWords) }
        return Histogram.fromBagOfWords(bagOfWords)
    }
}

View File

@ -1,46 +0,0 @@
package de.itkl.tfidf
import com.github.ajalt.mordant.terminal.Terminal
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.Resource
import de.itkl.processing.ParallelFlowProcessor
import de.itkl.textprocessing.*
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.flow.map
import kotlinx.coroutines.flow.reduce
import kotlinx.coroutines.flow.take
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
private val Log = KotlinLogging.logger { }
class Idf : FileProcessor {
override fun willProduce(path: Path): Path {
return path.parent.resolve(path.nameWithoutExtension + "-idf.csv")
}
override suspend fun process(resource: Resource): File {
Log.info { "Would produce: ${willProduce(resource.path)}" }
val resultFile = willProduce(resource.path).toFile()
val textFile = TextFile(resource.read())
val documents = textFile.splitByEmptyLines()
val bagOfWords = ParallelFlowProcessor<List<String>, BagOfWords>(
mapperFn = { document ->
val tokenizer = Tokenizer()
val bagOfWords = document.map { line ->
val tokens = tokenizer.tokenize(line)
BagOfWords.from(tokens)
}
.reduce { acc, bagOfWords -> acc.join(bagOfWords) }
bagOfWords
}
).process(documents)
val histogram = Histogram.fromBagOfWords(bagOfWords)
HistogramCsvStorage().save(histogram, resultFile)
return resultFile
}
}

View File

@ -0,0 +1,47 @@
package de.itkl.tfidf
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.fileprocessing.Resource
import de.itkl.textprocessing.HistogramCsvStorage
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
import kotlin.math.ln
import kotlin.math.log
import kotlin.math.log10
import kotlin.math.log2
private val Log = KotlinLogging.logger { }
/**
 * [FileProcessor] that turns the document-frequency CSV produced by
 * [DocumentFrequency] into an idf table: one CSV row per word with
 * idf = log10(numDocs / documentFrequency).
 */
class InverseDocumentFrequency : FileProcessor, KoinComponent {
    override fun willProduce(path: Path): Path {
        return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
    }

    override suspend fun process(resource: Resource): File {
        val histogram = HistogramCsvStorage().read(resource.toFile())
        // The total document count is carried in the histogram under a
        // pseudo-word; fail with a clear message if it is missing instead
        // of the original bare `!!`.
        val numDocs = checkNotNull(histogram.find { (word, _) -> word == "\$numDocs" }) {
            "\$numDocs entry missing from ${resource.path}"
        }.second.toInt()
        val targetFile = willProduce(resource.path).toFile()
        val progressBarFactory: ProgressBarFactory by inject()
        return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
            csvWriter().openAsync(targetFile, append = false) {
                writeRow("word", "idf")
                histogram.forEach { (word, count) ->
                    // Fix: skip the bookkeeping entry; the original also
                    // emitted an idf row for "$numDocs" itself.
                    if (word != "\$numDocs") {
                        writeRow(word, idf(numDocs, count))
                        progress.step()
                    }
                }
            }
            // Fix: return the file this processor produced; the original
            // returned the *input* path.
            targetFile
        }
    }

    private fun idf(numDocs: Int, count: UInt): Double {
        return log10(numDocs / count.toDouble())
    }
}

View File

@ -19,6 +19,18 @@ class TerminalProgressBarFactory : ProgressBarFactory {
}
return TerminalProgressBar(animation, resource.length())
}
// Builds a determinate progress bar labelled [name] with [max] total steps.
// The DSL call order below (text, percentage, bar, completed, time
// remaining) defines the on-screen column layout, so it must not change.
override fun new(name: String, max: Long): ProgressBar {
    val animation = terminal.progressAnimation {
        text(name)
        percentage()
        progressBar()
        completed()
        timeRemaining()
    }
    return TerminalProgressBar(animation, max)
}
}
class TerminalProgressBar(
@ -28,11 +40,16 @@ class TerminalProgressBar(
animation.start()
animation.updateTotal(total)
}
override fun update(bytesRead: Long) {
animation.update(bytesRead)
override fun update(progressed: Long) {
animation.update(progressed)
}
override fun step() {
animation.advance()
}
override fun close() {
animation.stop()
println()
}
}

View File

@ -1,9 +0,0 @@
package de.itkl.tfidf
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.textprocessing.Histogram
import io.github.oshai.kotlinlogging.KotlinLogging
import java.io.File
private val Log = KotlinLogging.logger { }

View File

@ -1,55 +0,0 @@
package de.itkl.tfidf
import com.github.ajalt.mordant.terminal.Terminal
import de.itkl.textprocessing.Histogram
import de.itkl.textprocessing.HistogramCsvStorage
import de.itkl.textprocessing.TextFile
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.flow.map
import org.tartarus.snowball.SnowballStemmer
import org.tartarus.snowball.ext.GermanStemmer
import java.io.File
import kotlin.io.path.exists
private val Log = KotlinLogging.logger { }
//class TfIdf {
// suspend fun computeTf(
// corpus: File,
// language: Language
// ): Histogram {
// Log.info { "Processing $corpus" }
// val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.csv")
//
// if(destination.exists()) {
// return HistogramCsvStorage().read(destination.toFile())
// }
//
// val filesize = corpus.length()
//
// val t = Terminal()
// val histogram = t.progressBar("Indexing ${corpus.name}", filesize) { val stemmer = stemmer(language)
// val words = TextFile(corpus).words {readBytes -> update(readBytes)}
// .map { stemmer.stem(it) }
// Histogram.from(words)
// }
//
// t.progressBar("Saving ${histogram.size} entries", histogram.size.toLong()) {
// HistogramCsvStorage()
// .save(histogram,destination.toFile()) { entriesWritten -> update(entriesWritten)}
// }
// return histogram
// }
//
// private fun stemmer(language: Language): SnowballStemmer {
// return when(language) {
// Language.DE -> GermanStemmer()
// }
// }
//
// private fun SnowballStemmer.stem(word: String): String {
// current = word
// stem()
// return current
// }
//}

View File

@ -3,11 +3,11 @@ package de.itkl.tfidf
import de.itkl.fileprocessing.FileProcessingPipeline
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory
import org.koin.core.component.KoinComponent
class TfIdfPipeline(private val language: Language) : FileProcessingPipeline() {
override val fileProcessor = listOf(
Idf()
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
override val fileProcessor = listOf<FileProcessor>(
DocumentFrequency(),
InverseDocumentFrequency()
)
override val progressBarFactory: ProgressBarFactory
get() = TerminalProgressBarFactory()
}

View File

@ -1,21 +0,0 @@
package de.itkl.tfidf
import com.github.ajalt.mordant.animation.ProgressAnimation
import com.github.ajalt.mordant.animation.progressAnimation
import com.github.ajalt.mordant.terminal.Terminal
import java.awt.SystemColor.text
suspend fun <T> Terminal.progressBar(name: String, overall: Long, context: suspend ProgressAnimation.() -> T):T {
val progress = progressAnimation {
text(name)
percentage()
progressBar()
completed()
timeRemaining()
}
progress.start()
progress.updateTotal(overall)
val result = context(progress)
progress.stop()
return result
}

View File

@ -1,15 +1,25 @@
pluginManagement {
includeBuild("build-logic")
}
//pluginManagement {
// includeBuild("build-logic")
//}
plugins {
id("org.gradle.toolchains.foojay-resolver-convention") version "0.4.0"
}
// Registers every direct subdirectory of [path] that carries its own
// build.gradle.kts as a Gradle subproject named "<path>:<dirName>".
fun includeDir(path: String) {
    // Fix: the original used a bare `!!`; fail with a diagnostic when the
    // path is not a listable directory.
    val children = requireNotNull(file(path).listFiles()) {
        "$path is not a directory (or cannot be listed)"
    }
    children
        .filter { it.isDirectory && it.resolve("build.gradle.kts").exists() }
        .forEach { dir ->
            include("$path:${dir.name}")
        }
}
rootProject.name = "docthor"
include(
"app",
"libraries:tfidf",
"libraries:textprocessing",
"libraries:fileprocessing",
)
includeDir("libraries")