Compare commits

..

No commits in common. "78af3f0d502d1d47503e2173dc00a6d66e6a35e3" and "71e066fcdededf8688fea789875cb1b4ade21528" have entirely different histories.

44 changed files with 295 additions and 413 deletions

View File

@ -6,17 +6,14 @@ import com.github.ajalt.clikt.parameters.options.option
import com.github.ajalt.clikt.parameters.options.required import com.github.ajalt.clikt.parameters.options.required
import com.github.ajalt.clikt.parameters.types.enum import com.github.ajalt.clikt.parameters.types.enum
import com.github.ajalt.clikt.parameters.types.file import com.github.ajalt.clikt.parameters.types.file
import de.itkl.fileprocessing.ProgressBarFactory import de.itkl.textprocessing.TextFile
import de.itkl.textprocessing.textProcessingModule
import de.itkl.tfidf.Language import de.itkl.tfidf.Language
import de.itkl.tfidf.TerminalProgressBarFactory
//import de.itkl.tfidf.TfIdf //import de.itkl.tfidf.TfIdf
import de.itkl.tfidf.TfIdfPipeline import de.itkl.tfidf.TfIdfPipeline
import kotlinx.coroutines.flow.take
import kotlinx.coroutines.runBlocking import kotlinx.coroutines.runBlocking
import org.koin.core.context.startKoin
import org.koin.dsl.module
class ComputeIdf : CliktCommand() { class ComputeTf : CliktCommand() {
private val corpus by option(help = "corpus") private val corpus by option(help = "corpus")
.file() .file()
.required() .required()
@ -25,20 +22,18 @@ class ComputeIdf : CliktCommand() {
.required() .required()
override fun run() = runBlocking { override fun run() = runBlocking {
TfIdfPipeline(force = true) TfIdfPipeline(language = Language.DE)
.input(corpus) .input(corpus)
// TextFile(corpus).splitByEmptyLines()
// .take(10)
// .collect { println(it) }
// val tfIdf = TfIdf()
// val histogram = tfIdf.computeTf(
// corpus,
// language
// )
// val tf = tfIdf.normalizeTf(histogram, corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-tf.csv").toFile())
} }
} }
fun main(args: Array<String>) { fun main(args: Array<String>) = ComputeTf().main(args)
startKoin {
modules(
textProcessingModule,
module {
single<ProgressBarFactory> {
TerminalProgressBarFactory()
}
})
ComputeIdf().main(args)
}
}

View File

@ -0,0 +1,19 @@
/*
* This file was generated by the Gradle 'init' task.
*
* This project uses @Incubating APIs which are subject to change.
*/
plugins {
// Support convention plugins written in Kotlin. Convention plugins are build scripts in 'src/main' that automatically become available as plugins in the main build.
`kotlin-dsl`
}
repositories {
// Use the plugin portal to apply community plugins in convention plugins.
gradlePluginPortal()
}
dependencies {
// Kotlin Gradle plugin as an implementation dependency of this build-logic project;
// presumably needed so convention plugins can apply "org.jetbrains.kotlin.jvm" — confirm.
implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:1.8.20")
}

View File

@ -0,0 +1,8 @@
/*
* This file was generated by the Gradle 'init' task.
*
* This settings file is used to specify which projects to include in your build-logic build.
* This project uses @Incubating APIs which are subject to change.
*/
rootProject.name = "docthor-build-logic"

View File

@ -1,5 +1,4 @@
import org.gradle.api.plugins.jvm.JvmTestSuite import org.gradle.api.plugins.jvm.JvmTestSuite
import org.jetbrains.kotlin.gradle.dsl.JvmTarget
plugins { plugins {
id("org.jetbrains.kotlin.jvm") id("org.jetbrains.kotlin.jvm")
@ -10,25 +9,9 @@ repositories {
} }
dependencies { dependencies {
val koin_version = "3.5.3"
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3") implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
implementation("io.insert-koin:koin-core:$koin_version")
} }
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of("19"))
}
}
tasks
.withType<org.jetbrains.kotlin.gradle.tasks.KotlinJvmCompile>()
.configureEach {
compilerOptions {
jvmTarget.set(JvmTarget.JVM_19)
}
}
testing { testing {
suites { suites {
// Configure the built-in test suite // Configure the built-in test suite

View File

@ -0,0 +1,22 @@
import org.codehaus.groovy.tools.shell.util.Logger.io
/*
* This file was generated by the Gradle 'init' task.
*
* This project uses @Incubating APIs which are subject to change.
*/
plugins {
// Apply the common convention plugin for shared build configuration between library and application projects.
id("docthor.kotlin-common-conventions")
// Apply the java-library plugin for API and implementation separation.
`java-library`
}
dependencies {
// Declared as `api`, so the logging facade is also visible to consumers of a library project.
api("io.github.oshai:kotlin-logging-jvm:5.1.0")
// Declared as `implementation`, so it stays internal to the library project itself.
implementation("org.slf4j:slf4j-api:2.0.9")
}

View File

@ -1,3 +0,0 @@
// Applies the shared library convention plugin to every subproject of ":libraries",
// so the individual library build scripts do not have to apply it themselves.
project(":libraries").subprojects {
apply(plugin = "docthor.kotlin-library-conventions")
}

View File

@ -1,11 +0,0 @@
plugins {
// Enables convention plugins written as Kotlin build scripts.
`kotlin-dsl`
}
repositories {
// Plugin portal is the repository used to resolve the dependencies declared below.
gradlePluginPortal()
}
dependencies {
// Kotlin Gradle plugin as an implementation dependency of this build-logic project.
implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:1.8.20")
}

View File

@ -1 +0,0 @@
rootProject.name = "docthor-build-logic"

View File

@ -1,10 +0,0 @@
plugins {
// Shared build configuration defined by the common convention plugin.
id("docthor.kotlin-common-conventions")
// java-library adds the `api` / `implementation` separation used below.
`java-library`
}
dependencies {
// Declared as `api`, so the logging facade is also visible to consumers of a library project.
api("io.github.oshai:kotlin-logging-jvm:5.1.0")
// Declared as `implementation`, so it stays internal to the library project itself.
implementation("org.slf4j:slf4j-api:2.0.9")
}

View File

@ -1,3 +1,6 @@
# This file was generated by the Gradle 'init' task.
# https://docs.gradle.org/current/userguide/build_environment.html#sec:gradle_configuration_properties
org.gradle.parallel=true org.gradle.parallel=true
org.gradle.caching=true org.gradle.caching=true

View File

@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.5-bin.zip distributionUrl=https\://services.gradle.org/distributions/gradle-8.2.1-bin.zip
networkTimeout=10000 networkTimeout=10000
validateDistributionUrl=true validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME zipStoreBase=GRADLE_USER_HOME

View File

@ -1,3 +1,7 @@
plugins {
id("docthor.kotlin-library-conventions")
}
dependencies { dependencies {
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3") implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
} }

View File

@ -1,31 +1,26 @@
package de.itkl.fileprocessing package de.itkl.fileprocessing
import io.github.oshai.kotlinlogging.KotlinLogging import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.annotation.KoinReflectAPI
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File import java.io.File
import kotlin.io.path.exists import kotlin.io.path.exists
private val Log = KotlinLogging.logger { } private val Log = KotlinLogging.logger { }
abstract class FileProcessingPipeline(private val force: Boolean = false) : KoinComponent { abstract class FileProcessingPipeline {
protected abstract val fileProcessor: List<FileProcessor> protected abstract val fileProcessor: List<FileProcessor>
private val progressBarFactory: ProgressBarFactory by inject() protected abstract val progressBarFactory: ProgressBarFactory
suspend fun input(file: File) { suspend fun input(file: File) {
var currentFile = file var currentFile = file
fileProcessor.forEach { processor -> fileProcessor.forEach { processor ->
val target = processor.willProduce(currentFile.toPath()) val target = processor.willProduce(currentFile.toPath())
if(target.exists() && !force) { if(target.exists()) {
Log.info { "$target exists. Skipping" } Log.info { "$target exists. Skipping" }
} else { } else {
Log.info { "$target does not exists. Creating" } Log.info { "$target does not exists. Creating" }
val resource = FileResource(currentFile) val resource = FileResource(currentFile)
val progress = ProgressResource(resource, progressBarFactory) val progress = ProgressResource(resource, progressBarFactory)
processor.process(progress) processor.process(progress)
Log.info { "File created: $target" }
} }
currentFile = target.toFile() currentFile = target.toFile()
} }

View File

@ -2,10 +2,8 @@ package de.itkl.fileprocessing
interface ProgressBarFactory { interface ProgressBarFactory {
fun new(resource: Resource): ProgressBar fun new(resource: Resource): ProgressBar
fun new(name: String, max: Long): ProgressBar
} }
interface ProgressBar : AutoCloseable { interface ProgressBar : AutoCloseable {
fun update(progressed: Long) fun update(bytesRead: Long)
fun step()
} }

View File

@ -0,0 +1,35 @@
package de.itkl.processing
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.map
import kotlinx.coroutines.flow.toList
import kotlinx.coroutines.runBlocking
import kotlinx.coroutines.withContext
import java.util.concurrent.Executors
import java.util.concurrent.TimeUnit
/**
 * Applies [mapperFn] to the elements of a [Flow] in parallel on a shared worker pool.
 *
 * Results are emitted in upstream order. The whole upstream is collected and
 * submitted to the pool before the first result is emitted, so one pending task
 * per upstream element is held in memory.
 *
 * @param T element type of the input flow
 * @param U element type produced by [mapperFn]
 * @param mapperFn transformation run on a pool thread for every element; it is
 *   called concurrently and must therefore be safe to call from several threads
 */
class ParallelFlowProcessor<T, U>(
    private val mapperFn: (T) -> U,
) {
    /**
     * Returns a cold flow that maps every element of [flow] with [mapperFn].
     *
     * An exception thrown by [mapperFn] is rethrown from the returned flow when the
     * corresponding result is reached.
     *
     * @param flow upstream elements to transform
     * @return flow of mapped values, in the same order as [flow]
     */
    suspend fun process(flow: Flow<T>): Flow<U> {
        val upstream = flow
        return flow {
            // Submit a Callable (not a Runnable) so each Future carries the mapped value.
            val pending = upstream
                .map { item -> workers.submit(java.util.concurrent.Callable { mapperFn(item) }) }
                .toList()
            pending.forEach { future ->
                // Future.get() blocks, so wait off the collector's thread; emit() itself
                // must stay in the collector's context and is therefore called outside.
                val mapped = withContext(Dispatchers.IO) {
                    try {
                        future.get()
                    } catch (e: java.util.concurrent.ExecutionException) {
                        // Surface the mapper's own exception rather than the wrapper.
                        throw e.cause ?: e
                    }
                }
                emit(mapped)
            }
            // The pool is shared by all processors and is never shut down, so there is
            // nothing to await here: awaitTermination() would only return on timeout.
        }
    }

    companion object {
        // Shared by every instance; work-stealing pools use daemon threads,
        // so leaving it running does not keep the JVM alive.
        private val workers = Executors.newWorkStealingPool(16)
    }
}

View File

@ -1,43 +0,0 @@
package de.itkl.processing
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.*
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.channels.consumeEach
import kotlinx.coroutines.flow.*
private val Log = KotlinLogging.logger { }
/**
 * Thin [Flow] wrapper that forwards collection to [mapperFlow] unchanged.
 *
 * It only gives the result of [parallelUnordered] a named type.
 */
class ParallelUnorderedFlow<U>(
    private val mapperFlow: Flow<U>,
) : Flow<U> by mapperFlow
/**
 * Maps the elements of this flow with [mapperFn] on [numWorkers] concurrent workers.
 *
 * Results are emitted as soon as a worker finishes, so the output order is not
 * related to the input order.
 *
 * The upstream is collected by a producer coroutine that is started in [scope]
 * immediately, independent of whether the returned flow is ever collected. The
 * returned flow is therefore meant to be collected exactly once; an uncollected
 * flow leaves the producer suspended until [scope] is cancelled.
 *
 * @param scope scope that owns the producer coroutine
 * @param numWorkers number of concurrent workers; must be positive
 * @param mapperFn transformation applied to every element; called concurrently
 * @return flow of mapped values in completion order
 * @throws IllegalArgumentException if [numWorkers] is not positive
 */
suspend fun <T : Any, U : Any> Flow<T>.parallelUnordered(
    scope: CoroutineScope,
    numWorkers: Int,
    mapperFn: (T) -> U,
): Flow<U> {
    require(numWorkers > 0) { "numWorkers must be positive, was $numWorkers" }

    val producerChannel = Channel<T>()
    scope.launch(Dispatchers.Default) {
        try {
            collect { producerChannel.send(it) }
            producerChannel.close()
        } catch (e: Throwable) {
            // Close with the cause so the workers stop instead of waiting forever
            // for elements that will never arrive; then let the failure propagate.
            producerChannel.close(e)
            throw e
        }
    }

    val mapperFlow = channelFlow {
        // Exactly numWorkers workers (the previous 0..numWorkers range started one extra).
        repeat(numWorkers) {
            launch(Dispatchers.Default) {
                // consumeEach cancels the channel when a worker fails or is cancelled,
                // which in turn releases the producer from a pending send().
                producerChannel.consumeEach {
                    send(mapperFn(it))
                }
            }
        }
    }
    return ParallelUnorderedFlow(mapperFlow)
}

View File

@ -1,3 +0,0 @@
dependencies {
implementation("io.ktor:ktor-http-jvm:2.3.7")
}

View File

@ -1,19 +0,0 @@
package de.itkl.io.implementation
import de.itkl.io.interfaces.Resource
import io.ktor.http.*
import java.io.File
import java.io.InputStream
/**
 * [Resource] backed by a file on the local file system.
 *
 * @param file the file whose name, size and content are exposed
 */
class FileSystemResource(private val file: File) : Resource() {
    /** Name of the file without its directory. */
    override val filename: String
        get() = file.name

    /**
     * Content type derived from the file extension.
     *
     * Falls back to `application/octet-stream` when the extension is missing or
     * unknown; `fromFilePath` returns an empty list in that case, so `first()`
     * would throw.
     */
    override val contentType: ContentType
        get() = ContentType.fromFilePath(file.path).firstOrNull()
            ?: ContentType.Application.OctetStream

    /** Current size of the file as reported by [File.length]. */
    override val length: Long
        get() = file.length()

    /** Opens a new input stream on the file; the caller is responsible for closing it. */
    override fun doRead(): InputStream {
        return file.inputStream()
    }
}

View File

@ -1,24 +0,0 @@
package de.itkl.io.interfaces
import io.ktor.http.*
import org.koin.core.component.KoinComponent
import org.koin.core.component.get
import org.koin.core.qualifier.named
import java.io.InputStream
import java.io.InputStreamReader
/**
 * A named, typed source of bytes.
 *
 * Subclasses provide the raw stream through [doRead]; callers use [read], which
 * additionally passes the stream through the [ResourceReadDecorator] resolved
 * from Koin when the length is known.
 */
abstract class Resource : KoinComponent {
    /** Name of the resource. */
    abstract val filename: String

    /** Content type of the resource. */
    abstract val contentType: ContentType

    /** Length of the resource, or `null` when it is not known. */
    abstract val length: Long?

    /** Opens the undecorated stream of this resource. */
    protected abstract fun doRead(): InputStream

    /**
     * Opens the resource for reading.
     *
     * @return the stream from [doRead], wrapped by the registered
     *   [ResourceReadDecorator] when [length] is known, otherwise unwrapped
     */
    fun read(): InputStream {
        // Must delegate to doRead(): calling read() here would recurse without end.
        val raw = doRead()
        val knownLength = length ?: return raw
        return get<ResourceReadDecorator>().decorate(
            length = knownLength,
            inputStream = raw,
        )
    }
}

View File

@ -1,15 +0,0 @@
package de.itkl.io.interfaces
import java.io.InputStream
/**
 * Hook that may wrap the stream of a resource before it is handed to a reader.
 */
interface ResourceReadDecorator {
    /**
     * Returns the stream a reader should use instead of [inputStream].
     *
     * @param length length supplied by the caller for the resource being read
     * @param inputStream the stream to wrap
     * @return the stream to read from; may be [inputStream] itself
     */
    fun decorate(length: Long, inputStream: InputStream): InputStream
}
/**
 * [ResourceReadDecorator] that leaves the stream untouched.
 */
class NoopResourceReadDecorator : ResourceReadDecorator {
    /** Ignores [length] and hands back [inputStream] as is. */
    override fun decorate(length: Long, inputStream: InputStream): InputStream = inputStream
}

View File

@ -1,9 +0,0 @@
package de.itkl.io
import de.itkl.io.interfaces.NoopResourceReadDecorator
import de.itkl.io.interfaces.ResourceReadDecorator
import org.koin.dsl.module
/**
 * Koin module of the io library.
 *
 * Binds [ResourceReadDecorator] to a single shared [NoopResourceReadDecorator].
 */
val ioModule = module {
single<ResourceReadDecorator> { NoopResourceReadDecorator() }
}

View File

@ -1,6 +1,9 @@
plugins {
id("docthor.kotlin-library-conventions")
}
dependencies { dependencies {
api("org.apache.lucene:lucene-analysis-common:9.9.0") api("org.apache.lucene:lucene-analysis-common:9.9.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2") implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
implementation("com.google.guava:guava:32.1.3-jre")
} }

View File

@ -16,8 +16,7 @@ class BagOfWords(private val data: MutableSet<String> = mutableSetOf()) : Iterab
} }
fun join(bagOfWords: BagOfWords): BagOfWords { fun join(bagOfWords: BagOfWords): BagOfWords {
data.addAll(bagOfWords.data) return BagOfWords(data.toMutableSet().apply { addAll(bagOfWords.data) })
return this
} }
override fun iterator(): Iterator<String> { override fun iterator(): Iterator<String> {

View File

@ -1,4 +0,0 @@
package de.itkl.textprocessing
/**
 * Empty class: it declares no members.
 * NOTE(review): looks like a placeholder — confirm intended use.
 */
class DocumentContainer {
}

View File

@ -11,16 +11,10 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
} }
} }
fun fromBagOfWords(bagOfWords: BagOfWords): Histogram {
val result = Histogram()
bagOfWords.forEach(result::add)
return result
}
suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram { suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram {
val result = Histogram() val result = Histogram()
flow.collect() { value -> flow.collectIndexed { index, value ->
println(index)
value.forEach(result::add) value.forEach(result::add)
} }
return result return result
@ -33,24 +27,12 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
} }
} }
fun join(other: Histogram): Histogram {
other.forEach { (word, count) ->
histo.merge(word, count) { a,b -> a + b }
}
return this
}
fun add(word: String) { fun add(word: String) {
histo.compute(word) { _, count -> histo.compute(word) { _, count ->
count?.let { it + 1u } ?: 1u count?.let { it + 1u } ?: 1u
} }
} }
fun set(word: String, count: Int) {
histo[word] = count.toUInt()
}
val size get() = histo.size val size get() = histo.size
override fun iterator(): Iterator<Pair<String, UInt>> { override fun iterator(): Iterator<Pair<String, UInt>> {
return iterator { return iterator {

View File

@ -1,19 +1,22 @@
package de.itkl.textprocessing package de.itkl.textprocessing
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.onCompletion import kotlinx.coroutines.flow.onCompletion
import kotlinx.coroutines.withContext import org.apache.lucene.analysis.standard.StandardTokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.AttributeFactory
import java.io.File
import java.io.InputStream import java.io.InputStream
import java.io.InputStreamReader import java.io.InputStreamReader
class TextFile(private val inputStream: InputStream) { class TextFile(val inputStream: InputStream) {
fun splitByEmptyLines(): Flow<List<String>> { fun splitByEmptyLines(): Flow<List<String>> {
val reader = InputStreamReader(inputStream) val reader = InputStreamReader(inputStream)
var list = mutableListOf<String>() var list = mutableListOf<String>()
return flow<List<String>> { return flow {
reader.useLines { lines -> reader.useLines { lines ->
lines.forEach { line -> lines.forEach { line ->
if(line.isEmpty()) { if(line.isEmpty()) {
@ -24,9 +27,21 @@ class TextFile(private val inputStream: InputStream) {
} }
} }
} }
}.onCompletion {
withContext(Dispatchers.IO) {
reader.close()
} }
} }
}
// fun words(progressOp: (read: Long) -> Unit = {}): Flow<String> {
// val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
// val tokenizer = StandardTokenizer(factory)
// val reader = ProgressInputStream(file.inputStream(), progressOp)
// tokenizer.setReader(InputStreamReader(reader))
// tokenizer.reset()
// val attr = tokenizer.addAttribute(CharTermAttribute::class.java)
// return flow {
// while (kotlin.runCatching { tokenizer.incrementToken() }.getOrElse { true } ) {
// emit(attr.toString())
// }
// }.onCompletion {
// tokenizer.close()
// }
// }
} }

View File

@ -1,21 +1,23 @@
package de.itkl.textprocessing.implementation package de.itkl.textprocessing
import de.itkl.textprocessing.interfaces.Tokenizer import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.onCompletion
import org.apache.lucene.analysis.standard.StandardTokenizer import org.apache.lucene.analysis.standard.StandardTokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.AttributeFactory import org.apache.lucene.util.AttributeFactory
import java.io.StringReader import java.io.StringReader
class LuceneTokenizer : Tokenizer { class Tokenizer {
private val tokenizer by lazy { private val tokenizer by lazy {
val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY val factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
val tokenizer = StandardTokenizer(factory) val tokenizer = StandardTokenizer(factory)
tokenizer tokenizer
} }
override fun tokenize(text: String): Sequence<String> { fun tokenize(input: String): Sequence<String> {
val reader = StringReader(text) val reader = StringReader(input)
tokenizer.setReader(reader) tokenizer.setReader(reader)
tokenizer.reset() tokenizer.reset()
val attr = tokenizer.addAttribute(CharTermAttribute::class.java) val attr = tokenizer.addAttribute(CharTermAttribute::class.java)

View File

@ -1,13 +0,0 @@
package de.itkl.textprocessing.implementation
import de.itkl.textprocessing.interfaces.Stemmer
import org.tartarus.snowball.ext.GermanStemmer
/**
 * [Stemmer] backed by the Snowball [GermanStemmer].
 *
 * One stemmer object is reused for all calls and is mutated on every call.
 * NOTE(review): sharing an instance between threads looks unsafe — confirm.
 */
class SnowballStemmerGerman : Stemmer {
    private val snowball = GermanStemmer()

    /**
     * Stems a single word.
     *
     * @param word the word to reduce to its stem
     * @return the stemmer's current value after stemming [word]
     */
    override fun stem(word: String): String = snowball.run {
        current = word
        stem()
        current
    }
}

View File

@ -1,3 +0,0 @@
package de.itkl.textprocessing.interfaces
/**
 * Marker interface without members.
 * NOTE(review): no implementations or usages are visible here — confirm intended contract.
 */
interface DocumentAssetManager {}

View File

@ -1,4 +0,0 @@
package de.itkl.textprocessing.interfaces
/**
 * Marker interface without members.
 * NOTE(review): no implementations or usages are visible here — confirm intended contract.
 */
interface DocumentExtractor {
}

View File

@ -1,5 +0,0 @@
package de.itkl.textprocessing.interfaces
/** Maps a word to its stem. */
interface Stemmer {
/**
 * @param word the word to stem
 * @return the stem of [word]
 */
fun stem(word: String): String
}

View File

@ -1,5 +0,0 @@
package de.itkl.textprocessing.interfaces
/** Splits text into tokens. */
interface Tokenizer {
/**
 * @param text the text to split
 * @return the tokens of [text] as a sequence
 */
fun tokenize(text: String): Sequence<String>
}

View File

@ -1,12 +0,0 @@
package de.itkl.textprocessing
import de.itkl.textprocessing.implementation.LuceneTokenizer
import de.itkl.textprocessing.implementation.SnowballStemmerGerman
import de.itkl.textprocessing.interfaces.Stemmer
import de.itkl.textprocessing.interfaces.Tokenizer
import org.koin.dsl.module
/**
 * Koin module of the text processing library.
 *
 * Both bindings are `factory` definitions, so every resolution of [Tokenizer] or
 * [Stemmer] creates a new [LuceneTokenizer] / [SnowballStemmerGerman].
 */
val textProcessingModule = module {
factory<Tokenizer> { LuceneTokenizer() }
factory<Stemmer> { SnowballStemmerGerman() }
}

View File

@ -1,7 +1,10 @@
plugins {
id("docthor.kotlin-library-conventions")
}
dependencies { dependencies {
api(project(":libraries:textprocessing")) api(project(":libraries:textprocessing"))
api(project(":libraries:fileprocessing")) api(project(":libraries:fileprocessing"))
implementation("com.github.ajalt.mordant:mordant:2.2.0") implementation("com.github.ajalt.mordant:mordant:2.2.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2") implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
implementation("com.google.guava:guava:32.1.3-jre")
} }

View File

@ -1,57 +0,0 @@
package de.itkl.tfidf
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.Resource
import de.itkl.processing.parallelUnordered
import de.itkl.textprocessing.*
import de.itkl.textprocessing.interfaces.Stemmer
import de.itkl.textprocessing.interfaces.Tokenizer
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.*
import kotlinx.coroutines.flow.*
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
import kotlin.math.max
private val Log = KotlinLogging.logger { }
/**
 * [FileProcessor] that counts, for every stemmed token, in how many documents of a
 * corpus it occurs and stores the result as a CSV histogram.
 *
 * Documents are the blocks returned by `TextFile.splitByEmptyLines()`. The total
 * number of documents is stored in the histogram under the key `$numDocs`.
 */
class DocumentFrequency : FileProcessor, KoinComponent {
    /**
     * Path of the CSV written for [path]: same directory, file name
     * `<name-without-extension>-document-frequency.csv`.
     */
    override fun willProduce(path: Path): Path {
        // resolveSibling also works for a bare file name, where path.parent is null.
        return path.resolveSibling(path.nameWithoutExtension + "-document-frequency.csv")
    }

    /**
     * Computes the document-frequency histogram of [resource] and writes it to
     * the file given by [willProduce].
     *
     * @param resource the corpus to read
     * @return the CSV file that was written
     */
    override suspend fun process(resource: Resource): File = coroutineScope {
        Log.info { "Would produce: ${willProduce(resource.path)}" }
        val resultFile = willProduce(resource.path).toFile()
        // fold (instead of reduce) so that an empty corpus yields an empty histogram
        // rather than an exception; -1 is the "no document seen yet" index.
        val (lastIndex, histogram) = TextFile(resource.read())
            .splitByEmptyLines()
            .withIndex()
            .parallelUnordered(this, 16) { (index, doc) ->
                index to collectWordsOfDocument(doc)
            }
            .fold(-1 to Histogram()) { (index, acc), (otherIndex, other) ->
                max(index, otherIndex) to acc.join(other)
            }
        // Indices are zero-based, so the document count is the highest index plus one.
        val numDocs = lastIndex + 1
        Log.info { "Writing CSV $resultFile" }
        histogram.set("\$numDocs", numDocs)
        HistogramCsvStorage().save(histogram, resultFile)
        resultFile
    }

    /**
     * Builds the histogram of a single document: every distinct stemmed token of
     * [document] is counted once.
     *
     * @param document the lines of one document
     * @return histogram of the document, empty for an empty document
     */
    private fun collectWordsOfDocument(document: List<String>): Histogram {
        if (document.isEmpty()) {
            return Histogram()
        }
        // Resolved per call on purpose: this function runs on several workers at once,
        // and resolving here keeps the instances local to one call.
        val tokenizer: Tokenizer by inject()
        val stemmer: Stemmer by inject()
        val bagOfWords = document
            .map { line ->
                val tokens = tokenizer.tokenize(line)
                BagOfWords.from(tokens.map { stemmer.stem(it) })
            }
            .reduce { acc, bagOfWords -> acc.join(bagOfWords) }
        return Histogram.fromBagOfWords(bagOfWords)
    }
}

View File

@ -0,0 +1,46 @@
package de.itkl.tfidf
import com.github.ajalt.mordant.terminal.Terminal
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.Resource
import de.itkl.processing.ParallelFlowProcessor
import de.itkl.textprocessing.*
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.flow.map
import kotlinx.coroutines.flow.reduce
import kotlinx.coroutines.flow.take
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
private val Log = KotlinLogging.logger { }
/**
 * [FileProcessor] that builds a histogram over the bags of words of all documents
 * of a corpus and stores it as CSV.
 *
 * NOTE(review): despite the name, no inverse (logarithmic) weighting is applied
 * here; the file contains the histogram as built by `Histogram.fromBagOfWords`.
 */
class Idf : FileProcessor {
    /**
     * Path of the CSV written for [path]: same directory, file name
     * `<name-without-extension>-idf.csv`.
     */
    override fun willProduce(path: Path): Path {
        // resolveSibling also works for a bare file name, where path.parent is null.
        return path.resolveSibling(path.nameWithoutExtension + "-idf.csv")
    }

    /**
     * Tokenizes every document of [resource] in parallel, merges the tokens of each
     * document into one bag of words and writes the resulting histogram.
     *
     * @param resource the corpus to read; documents are separated by empty lines
     * @return the CSV file that was written
     */
    override suspend fun process(resource: Resource): File {
        Log.info { "Would produce: ${willProduce(resource.path)}" }
        val resultFile = willProduce(resource.path).toFile()
        val documents = TextFile(resource.read()).splitByEmptyLines()
        val bagOfWords = ParallelFlowProcessor<List<String>, BagOfWords>(
            mapperFn = { document ->
                // One tokenizer per document: the mapper runs on several threads.
                val tokenizer = Tokenizer()
                // fold (instead of map + reduce) so a document without lines yields an
                // empty bag rather than the exception reduce throws on an empty list.
                document.fold(BagOfWords()) { acc, line ->
                    acc.join(BagOfWords.from(tokenizer.tokenize(line)))
                }
            }
        ).process(documents)
        val histogram = Histogram.fromBagOfWords(bagOfWords)
        HistogramCsvStorage().save(histogram, resultFile)
        return resultFile
    }
}

View File

@ -1,47 +0,0 @@
package de.itkl.tfidf
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.fileprocessing.Resource
import de.itkl.textprocessing.HistogramCsvStorage
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
import kotlin.math.ln
import kotlin.math.log
import kotlin.math.log10
import kotlin.math.log2
private val Log = KotlinLogging.logger { }
/**
 * [FileProcessor] that turns a document-frequency histogram into a CSV with one
 * `word,idf` row per word, where `idf = log10(numDocs / count)`.
 *
 * The number of documents is read from the histogram entry with the key `$numDocs`.
 */
class InverseDocumentFrequency : FileProcessor, KoinComponent {
    /**
     * Path of the CSV written for [path]: same directory, file name
     * `<name-without-extension>-inverse-document-frequency.csv`.
     */
    override fun willProduce(path: Path): Path {
        // resolveSibling also works for a bare file name, where path.parent is null.
        return path.resolveSibling(path.nameWithoutExtension + "-inverse-document-frequency.csv")
    }

    /**
     * Reads the histogram stored in [resource] and writes the idf CSV.
     *
     * @param resource CSV histogram containing a `$numDocs` entry
     * @return the CSV file that was written
     * @throws IllegalStateException if the histogram has no `$numDocs` entry
     */
    override suspend fun process(resource: Resource): File {
        val histogram = HistogramCsvStorage().read(resource.toFile())
        val numDocs = checkNotNull(histogram.find { (word, _) -> word == NUM_DOCS_KEY }) {
            "Histogram read from ${resource.path} has no $NUM_DOCS_KEY entry"
        }.second.toInt()
        val progressBarFactory: ProgressBarFactory by inject()
        val resultFile = willProduce(resource.path).toFile()
        return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
            csvWriter().openAsync(resultFile, append = false) {
                writeRow("word", "idf")
                histogram.forEach { (word, count) ->
                    // The document-count sentinel is bookkeeping, not a word of the corpus.
                    if (word != NUM_DOCS_KEY) {
                        writeRow(word, idf(numDocs, count))
                    }
                    // Step for every entry so the bar reaches the announced total.
                    progress.step()
                }
            }
            // Return the file that was produced, not the input it was computed from.
            resultFile
        }
    }

    /** Inverse document frequency of a word that occurs in [count] of [numDocs] documents. */
    private fun idf(numDocs: Int, count: UInt): Double {
        return log10(numDocs / count.toDouble())
    }

    private companion object {
        /** Histogram key under which the total number of documents is stored. */
        const val NUM_DOCS_KEY = "\$numDocs"
    }
}

View File

@ -19,18 +19,6 @@ class TerminalProgressBarFactory : ProgressBarFactory {
} }
return TerminalProgressBar(animation, resource.length()) return TerminalProgressBar(animation, resource.length())
} }
override fun new(name: String, max: Long): ProgressBar {
val animation = terminal.progressAnimation {
text(name)
percentage()
progressBar()
completed()
timeRemaining()
}
return TerminalProgressBar(animation, max)
}
} }
class TerminalProgressBar( class TerminalProgressBar(
@ -40,16 +28,11 @@ class TerminalProgressBar(
animation.start() animation.start()
animation.updateTotal(total) animation.updateTotal(total)
} }
override fun update(progressed: Long) { override fun update(bytesRead: Long) {
animation.update(progressed) animation.update(bytesRead)
}
override fun step() {
animation.advance()
} }
override fun close() { override fun close() {
animation.stop() animation.stop()
println()
} }
} }

View File

@ -0,0 +1,9 @@
package de.itkl.tfidf
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.textprocessing.Histogram
import io.github.oshai.kotlinlogging.KotlinLogging
import java.io.File
private val Log = KotlinLogging.logger { }

View File

@ -0,0 +1,55 @@
package de.itkl.tfidf
import com.github.ajalt.mordant.terminal.Terminal
import de.itkl.textprocessing.Histogram
import de.itkl.textprocessing.HistogramCsvStorage
import de.itkl.textprocessing.TextFile
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.flow.map
import org.tartarus.snowball.SnowballStemmer
import org.tartarus.snowball.ext.GermanStemmer
import java.io.File
import kotlin.io.path.exists
private val Log = KotlinLogging.logger { }
//class TfIdf {
// suspend fun computeTf(
// corpus: File,
// language: Language
// ): Histogram {
// Log.info { "Processing $corpus" }
// val destination = corpus.toPath().parent.resolve("${corpus.nameWithoutExtension}-terms.csv")
//
// if(destination.exists()) {
// return HistogramCsvStorage().read(destination.toFile())
// }
//
// val filesize = corpus.length()
//
// val t = Terminal()
// val histogram = t.progressBar("Indexing ${corpus.name}", filesize) { val stemmer = stemmer(language)
// val words = TextFile(corpus).words {readBytes -> update(readBytes)}
// .map { stemmer.stem(it) }
// Histogram.from(words)
// }
//
// t.progressBar("Saving ${histogram.size} entries", histogram.size.toLong()) {
// HistogramCsvStorage()
// .save(histogram,destination.toFile()) { entriesWritten -> update(entriesWritten)}
// }
// return histogram
// }
//
// private fun stemmer(language: Language): SnowballStemmer {
// return when(language) {
// Language.DE -> GermanStemmer()
// }
// }
//
// private fun SnowballStemmer.stem(word: String): String {
// current = word
// stem()
// return current
// }
//}

View File

@ -3,11 +3,11 @@ package de.itkl.tfidf
import de.itkl.fileprocessing.FileProcessingPipeline import de.itkl.fileprocessing.FileProcessingPipeline
import de.itkl.fileprocessing.FileProcessor import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory import de.itkl.fileprocessing.ProgressBarFactory
import org.koin.core.component.KoinComponent
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) { class TfIdfPipeline(private val language: Language) : FileProcessingPipeline() {
override val fileProcessor = listOf<FileProcessor>( override val fileProcessor = listOf(
DocumentFrequency(), Idf()
InverseDocumentFrequency()
) )
override val progressBarFactory: ProgressBarFactory
get() = TerminalProgressBarFactory()
} }

View File

@ -0,0 +1,21 @@
package de.itkl.tfidf
import com.github.ajalt.mordant.animation.ProgressAnimation
import com.github.ajalt.mordant.animation.progressAnimation
import com.github.ajalt.mordant.terminal.Terminal
import java.awt.SystemColor.text
/**
 * Runs [context] while showing a progress animation on this terminal.
 *
 * The animation shows [name], percentage, a bar, the completed amount and the
 * remaining time. It is started before [context] runs and is always stopped
 * afterwards, also when [context] throws or is cancelled.
 *
 * @param name label shown in front of the bar
 * @param overall total passed to the animation
 * @param context work to run; receives the animation so it can report progress
 * @return the result of [context]
 */
suspend fun <T> Terminal.progressBar(name: String, overall: Long, context: suspend ProgressAnimation.() -> T): T {
    val progress = progressAnimation {
        text(name)
        percentage()
        progressBar()
        completed()
        timeRemaining()
    }
    progress.start()
    progress.updateTotal(overall)
    return try {
        context(progress)
    } finally {
        // Stop even on failure so the animation does not keep running.
        progress.stop()
    }
}

View File

@ -1,25 +1,15 @@
//pluginManagement { pluginManagement {
// includeBuild("build-logic") includeBuild("build-logic")
//} }
plugins { plugins {
id("org.gradle.toolchains.foojay-resolver-convention") version "0.4.0" id("org.gradle.toolchains.foojay-resolver-convention") version "0.4.0"
} }
/**
 * Includes every direct subdirectory of [path] that contains a `build.gradle.kts`
 * as the project `<path>:<directory name>`.
 *
 * @param path directory to scan, resolved with the settings script's `file()`
 * @throws IllegalStateException if [path] cannot be listed
 */
fun includeDir(path: String) {
    // listFiles() returns null when the path is not a directory or cannot be read;
    // fail with a message that names the path instead of a bare NullPointerException.
    val children = file(path).listFiles()
        ?: error("Cannot list '$path': not a directory or not readable")
    children
        .filter { it.isDirectory && it.resolve("build.gradle.kts").exists() }
        .forEach { dir -> include("$path:${dir.name}") }
}
rootProject.name = "docthor" rootProject.name = "docthor"
include( include(
"app", "app",
"libraries:tfidf",
"libraries:textprocessing",
"libraries:fileprocessing",
) )
includeDir("libraries")