refactoring into parallelUnordered method

develop
Timo Bryant 2023-12-18 22:55:29 +01:00
parent 13110fa8e5
commit 4cafac4583
7 changed files with 69 additions and 68 deletions
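In short: the stubbed, executor-based ParallelFlowProcessor is deleted and replaced by a channel-backed parallelUnordered extension on Flow; BagOfWords.join becomes a mutating accumulator; Histogram gains fromBagOfWords(BagOfWords) and join; and Idf.process is rewritten on top of the new operator.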

de/itkl/processing/ParallelFlowProcessor.kt (View File)

@@ -1,37 +0,0 @@
-package de.itkl.processing
-
-import kotlinx.coroutines.Dispatchers
-import kotlinx.coroutines.flow.Flow
-import kotlinx.coroutines.flow.flow
-import kotlinx.coroutines.flow.map
-import kotlinx.coroutines.flow.toList
-import kotlinx.coroutines.runBlocking
-import kotlinx.coroutines.withContext
-import java.util.concurrent.Executors
-import java.util.concurrent.TimeUnit
-
-@Suppress("UNCHECKED_CAST")
-class ParallelFlowProcessor<T, U>(
-    private val mapperFn: (T) -> U) {
-
-    companion object {
-        private val workers = Executors.newWorkStealingPool(16)
-    }
-
-    suspend fun process(flow: Flow<T>): Flow<U> {
-        TODO()
-        // flow.map { }
-        // return flow {
-        //     flow.map { kotlinx.coroutines.Runnable {
-        //         val result = mapperFn(it)
-        //         runBlocking { emit(result) }
-        //     } }
-        //         .map { job -> workers.submit(job) }
-        //         .toList()
-        //         .forEach { future -> emit(future.get() as U) }
-        //     withContext(Dispatchers.IO) {
-        //         workers.awaitTermination(10000, TimeUnit.DAYS)
-        //     }
-        // }
-    }
-}
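For context: the deleted processor never got past TODO(). Its commented-out draft submitted Runnables to a work-stealing pool and called emit via runBlocking from pool threads, which the flow {} builder forbids: emission must happen in the coroutine that runs the builder, or collection fails with "Flow invariant is violated". A minimal illustration of that constraint (not from the repo):

```kotlin
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.runBlocking
import kotlin.concurrent.thread

fun main() = runBlocking {
    flow {
        // Emitting from a foreign thread/coroutine breaks context preservation;
        // kotlinx.coroutines throws IllegalStateException in the worker thread.
        thread { runBlocking { emit(42) } }.join()
    }.collect { println(it) }
}
```

The replacement below avoids the problem by having workers send into a channelFlow, which is designed for concurrent emission.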

de/itkl/processing/ParallelUnorderedFlow.kt (View File)

@@ -0,0 +1,41 @@
+package de.itkl.processing
+
+import kotlinx.coroutines.*
+import kotlinx.coroutines.channels.Channel
+import kotlinx.coroutines.channels.consumeEach
+import kotlinx.coroutines.flow.*
+
+// Wraps the mapper flow so that collecting it also waits for the
+// upstream producer to finish before the flow completes.
+class ParallelUnorderedFlow<U>(
+    private val producerJob: Job,
+    private val mapperFlow: Flow<U>
+) : Flow<U> {
+
+    override suspend fun collect(collector: FlowCollector<U>) {
+        mapperFlow.collect(collector)
+        producerJob.join()
+    }
+}
+
+// Maps each upstream element with mapperFn on numWorkers concurrent workers.
+// Results are emitted in completion order, not in upstream order.
+suspend fun <T : Any, U : Any> Flow<T>.parallelUnordered(
+    scope: CoroutineScope,
+    numWorkers: Int,
+    mapperFn: (T) -> U
+): Flow<U> {
+    val producerChannel = Channel<T>()
+    val producerJob = scope.launch(Dispatchers.Default) {
+        collect { producerChannel.send(it) }
+        // Close the channel so the workers' consumeEach loops can terminate.
+        producerChannel.close()
+    }
+    val mapperFlow = channelFlow {
+        repeat(numWorkers) {
+            launch(Dispatchers.Default) {
+                producerChannel.consumeEach { send(mapperFn(it)) }
+            }
+        }
+    }
+    return ParallelUnorderedFlow(producerJob, mapperFlow)
+}
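A rough usage sketch of the new operator (range, worker count, and the squaring mapper are invented for illustration):

```kotlin
import kotlinx.coroutines.flow.asFlow
import kotlinx.coroutines.flow.toList
import kotlinx.coroutines.runBlocking

fun main() = runBlocking {
    // 4 workers map the elements concurrently; results arrive in completion order.
    val results = (1..10).asFlow()
        .parallelUnordered(this, numWorkers = 4) { it * it }
        .toList()
    println(results.sorted()) // [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
}
```

Unordered delivery is the point of the design: one slow element does not hold back results that finish after it, at the cost of losing upstream order.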

build.gradle.kts (View File)

@@ -5,5 +5,6 @@ plugins {
 dependencies {
     api("org.apache.lucene:lucene-analysis-common:9.9.0")
     implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
+    implementation("com.google.guava:guava:32.1.3-jre")
 }

de/itkl/textprocessing/BagOfWords.kt (View File)

@@ -16,7 +16,8 @@ class BagOfWords(private val data: MutableSet<String> = mutableSetOf()) : Iterab
     }
 
     fun join(bagOfWords: BagOfWords): BagOfWords {
-        return BagOfWords(data.toMutableSet().apply { addAll(bagOfWords.data) })
+        data.addAll(bagOfWords.data)
+        return this
     }
 
     override fun iterator(): Iterator<String> {
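Note that join changed semantics, not just shape: it now mutates the receiver and returns this rather than allocating a new BagOfWords per step, so reduce-style folds reuse a single accumulator. A small sketch of the new behavior (word literals invented, assuming from accepts a token list as in Idf.kt):

```kotlin
val a = BagOfWords.from(listOf("foo"))
val b = BagOfWords.from(listOf("bar"))
val joined = a.join(b)
// joined === a, and a now contains both "foo" and "bar"; b is unchanged.
```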

de/itkl/textprocessing/Histogram.kt (View File)

@@ -11,6 +11,13 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
         }
     }
 
+    fun fromBagOfWords(bagOfWords: BagOfWords): Histogram {
+        val result = Histogram()
+        bagOfWords.forEach(result::add)
+        return result
+    }
+
     suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram {
         val result = Histogram()
         flow.collect { value ->
@@ -26,6 +33,13 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
         }
     }
 
+    fun join(other: Histogram): Histogram {
+        other.forEach { (word, count) ->
+            histo.merge(word, count) { a, b -> a + b }
+        }
+        return this
+    }
+
     fun add(word: String) {
         histo.compute(word) { _, count ->
             count?.let { it + 1u } ?: 1u
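Together the two additions let callers build a histogram per document and fold them pairwise. A quick sketch (words invented; BagOfWords de-duplicates, so each word counts at most once per document):

```kotlin
val h1 = Histogram.fromBagOfWords(BagOfWords.from(listOf("foo", "bar")))
val h2 = Histogram.fromBagOfWords(BagOfWords.from(listOf("bar")))
h1.join(h2) // h1 now holds "foo" -> 1u and "bar" -> 2u
```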

build.gradle.kts (View File)

@@ -7,4 +7,5 @@ dependencies {
     api(project(":libraries:fileprocessing"))
     implementation("com.github.ajalt.mordant:mordant:2.2.0")
     implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
+    implementation("com.google.guava:guava:32.1.3-jre")
 }

de/itkl/tfidf/Idf.kt (View File)

@@ -3,9 +3,8 @@ package de.itkl.tfidf
 import com.github.ajalt.mordant.terminal.Terminal
 import de.itkl.fileprocessing.FileProcessor
 import de.itkl.fileprocessing.Resource
-import de.itkl.processing.ParallelFlowProcessor
+import de.itkl.processing.parallelUnordered
 import de.itkl.textprocessing.*
-import de.itkl.tfidf.Idf.Companion.count
 import io.github.oshai.kotlinlogging.KotlinLogging
 import kotlinx.coroutines.*
 import kotlinx.coroutines.channels.Channel
@@ -22,10 +21,6 @@ private val Log = KotlinLogging.logger { }
 class Idf : FileProcessor {
 
-    companion object {
-        val count = AtomicInteger(0)
-    }
-
     override fun willProduce(path: Path): Path {
         return path.parent.resolve(path.nameWithoutExtension + "-idf.csv")
     }
@@ -33,35 +28,20 @@ class Idf : FileProcessor {
     override suspend fun process(resource: Resource): File = coroutineScope {
         Log.info { "Would produce: ${willProduce(resource.path)}" }
         val resultFile = willProduce(resource.path).toFile()
-        val channel = Channel<List<String>>(0)
-        launch {
-            TextFile(resource.read())
-                .splitByEmptyLines()
-                .collect {
-                    channel.send(it)
-                }
-        }
-        val bagOfWords = channelFlow {
-            (0..16).map {
-                launch(Dispatchers.Default) {
-                    channel.consumeEach {
-                        val value = collectWordsOfDocument(it)
-                        send(value)
-                    }
-                }
-            }
-        }
-        val histogram = Histogram.fromBagOfWords(bagOfWords)
+        val histogram = TextFile(resource.read())
+            .splitByEmptyLines()
+            .parallelUnordered(this, 16) { doc -> collectWordsOfDocument(doc) }
+            .reduce { acc, other -> acc.join(other) }
         HistogramCsvStorage().save(histogram, resultFile)
         resultFile
     }
 
-    private fun collectWordsOfDocument(document: List<String>): BagOfWords {
+    private fun collectWordsOfDocument(document: List<String>): Histogram {
         if (document.isEmpty()) {
-            return BagOfWords()
+            return Histogram()
         }
         val tokenizer = Tokenizer()
         val bagOfWords = document.map { line ->
@@ -69,7 +49,7 @@ class Idf : FileProcessor {
             BagOfWords.from(tokens)
         }
             .reduce { acc, bagOfWords -> acc.join(bagOfWords) }
-        return bagOfWords
+        return Histogram.fromBagOfWords(bagOfWords)
     }
 }
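Net effect: because each document contributes a de-duplicated BagOfWords, the reduced histogram is a document-frequency table, which is exactly what an IDF computation needs. The usual next step would be something like the helper below (illustrative, not part of this commit):

```kotlin
import kotlin.math.ln

// Classic IDF: rare words score high, ubiquitous words approach zero.
fun idf(totalDocs: Int, docFrequency: UInt): Double =
    ln(totalDocs.toDouble() / docFrequency.toDouble())
```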