Parallelization finally works

develop
Timo Bryant 2023-12-18 21:59:15 +01:00
parent 71e066fcde
commit 13110fa8e5
4 changed files with 63 additions and 32 deletions

View File

@@ -21,6 +21,7 @@ abstract class FileProcessingPipeline {
val resource = FileResource(currentFile) val resource = FileResource(currentFile)
val progress = ProgressResource(resource, progressBarFactory) val progress = ProgressResource(resource, progressBarFactory)
processor.process(progress) processor.process(progress)
Log.info { "File created: $target" }
} }
currentFile = target.toFile() currentFile = target.toFile()
} }

View File

@@ -19,17 +19,19 @@ class ParallelFlowProcessor<T,U>(
} }
suspend fun process(flow: Flow<T>): Flow<U> { suspend fun process(flow: Flow<T>): Flow<U> {
return flow { TODO()
flow.map { kotlinx.coroutines.Runnable { // flow.map { }
val result = mapperFn(it) // return flow {
runBlocking { emit(result) } // flow.map { kotlinx.coroutines.Runnable {
} } // val result = mapperFn(it)
.map { job -> workers.submit(job)} // runBlocking { emit(result) }
.toList() // } }
.forEach { future -> emit(future.get() as U) } // .map { job -> workers.submit(job)}
withContext(Dispatchers.IO) { // .toList()
workers.awaitTermination(10000, TimeUnit.DAYS) // .forEach { future -> emit(future.get() as U) }
} // withContext(Dispatchers.IO) {
} // workers.awaitTermination(10000, TimeUnit.DAYS)
// }
// }
} }
} }

View File

@@ -13,8 +13,7 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram { suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram {
val result = Histogram() val result = Histogram()
flow.collectIndexed { index, value -> flow.collect() { value ->
println(index)
value.forEach(result::add) value.forEach(result::add)
} }
return result return result

View File

@@ -5,42 +5,71 @@ import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.Resource import de.itkl.fileprocessing.Resource
import de.itkl.processing.ParallelFlowProcessor import de.itkl.processing.ParallelFlowProcessor
import de.itkl.textprocessing.* import de.itkl.textprocessing.*
import de.itkl.tfidf.Idf.Companion.count
import io.github.oshai.kotlinlogging.KotlinLogging import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.flow.map import kotlinx.coroutines.*
import kotlinx.coroutines.flow.reduce import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.flow.take import kotlinx.coroutines.channels.consumeEach
import kotlinx.coroutines.flow.*
import kotlinx.coroutines.sync.Semaphore
import kotlinx.coroutines.sync.withPermit
import java.io.File import java.io.File
import java.nio.file.Path import java.nio.file.Path
import java.util.concurrent.atomic.AtomicInteger
import kotlin.io.path.nameWithoutExtension import kotlin.io.path.nameWithoutExtension
private val Log = KotlinLogging.logger { } private val Log = KotlinLogging.logger { }
class Idf : FileProcessor { class Idf : FileProcessor {
companion object {
val count = AtomicInteger(0)
}
override fun willProduce(path: Path): Path { override fun willProduce(path: Path): Path {
return path.parent.resolve(path.nameWithoutExtension + "-idf.csv") return path.parent.resolve(path.nameWithoutExtension + "-idf.csv")
} }
override suspend fun process(resource: Resource): File { override suspend fun process(resource: Resource): File = coroutineScope {
Log.info { "Would produce: ${willProduce(resource.path)}" } Log.info { "Would produce: ${willProduce(resource.path)}" }
val resultFile = willProduce(resource.path).toFile() val resultFile = willProduce(resource.path).toFile()
val textFile = TextFile(resource.read()) val channel = Channel<List<String>>(0)
val documents = textFile.splitByEmptyLines()
val bagOfWords = ParallelFlowProcessor<List<String>, BagOfWords>(
mapperFn = { document ->
val tokenizer = Tokenizer()
val bagOfWords = document.map { line ->
val tokens = tokenizer.tokenize(line)
BagOfWords.from(tokens)
}
.reduce { acc, bagOfWords -> acc.join(bagOfWords) }
bagOfWords
}
).process(documents)
launch {
TextFile(resource.read())
.splitByEmptyLines()
.collect {
channel.send(it)
}
}
val bagOfWords = channelFlow {
(0..16).map {
launch(Dispatchers.Default) {
channel.consumeEach {
val value = collectWordsOfDocument(it)
send(value)
}
}
}
}
val histogram = Histogram.fromBagOfWords(bagOfWords) val histogram = Histogram.fromBagOfWords(bagOfWords)
HistogramCsvStorage().save(histogram, resultFile) HistogramCsvStorage().save(histogram, resultFile)
return resultFile resultFile
}
private fun collectWordsOfDocument(document: List<String>): BagOfWords {
if (document.isEmpty()) {
return BagOfWords()
}
val tokenizer = Tokenizer()
val bagOfWords = document.map { line ->
val tokens = tokenizer.tokenize(line)
BagOfWords.from(tokens)
}
.reduce { acc, bagOfWords -> acc.join(bagOfWords) }
return bagOfWords
} }
} }