paralleling finally works
parent
71e066fcde
commit
13110fa8e5
|
|
@ -21,6 +21,7 @@ abstract class FileProcessingPipeline {
|
||||||
val resource = FileResource(currentFile)
|
val resource = FileResource(currentFile)
|
||||||
val progress = ProgressResource(resource, progressBarFactory)
|
val progress = ProgressResource(resource, progressBarFactory)
|
||||||
processor.process(progress)
|
processor.process(progress)
|
||||||
|
Log.info { "File created: $target" }
|
||||||
}
|
}
|
||||||
currentFile = target.toFile()
|
currentFile = target.toFile()
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -19,17 +19,19 @@ class ParallelFlowProcessor<T,U>(
|
||||||
}
|
}
|
||||||
|
|
||||||
suspend fun process(flow: Flow<T>): Flow<U> {
|
suspend fun process(flow: Flow<T>): Flow<U> {
|
||||||
return flow {
|
TODO()
|
||||||
flow.map { kotlinx.coroutines.Runnable {
|
// flow.map { }
|
||||||
val result = mapperFn(it)
|
// return flow {
|
||||||
runBlocking { emit(result) }
|
// flow.map { kotlinx.coroutines.Runnable {
|
||||||
} }
|
// val result = mapperFn(it)
|
||||||
.map { job -> workers.submit(job)}
|
// runBlocking { emit(result) }
|
||||||
.toList()
|
// } }
|
||||||
.forEach { future -> emit(future.get() as U) }
|
// .map { job -> workers.submit(job)}
|
||||||
withContext(Dispatchers.IO) {
|
// .toList()
|
||||||
workers.awaitTermination(10000, TimeUnit.DAYS)
|
// .forEach { future -> emit(future.get() as U) }
|
||||||
}
|
// withContext(Dispatchers.IO) {
|
||||||
}
|
// workers.awaitTermination(10000, TimeUnit.DAYS)
|
||||||
|
// }
|
||||||
|
// }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -13,8 +13,7 @@ class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : I
|
||||||
|
|
||||||
suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram {
|
suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram {
|
||||||
val result = Histogram()
|
val result = Histogram()
|
||||||
flow.collectIndexed { index, value ->
|
flow.collect() { value ->
|
||||||
println(index)
|
|
||||||
value.forEach(result::add)
|
value.forEach(result::add)
|
||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
|
|
|
||||||
|
|
@ -5,42 +5,71 @@ import de.itkl.fileprocessing.FileProcessor
|
||||||
import de.itkl.fileprocessing.Resource
|
import de.itkl.fileprocessing.Resource
|
||||||
import de.itkl.processing.ParallelFlowProcessor
|
import de.itkl.processing.ParallelFlowProcessor
|
||||||
import de.itkl.textprocessing.*
|
import de.itkl.textprocessing.*
|
||||||
|
import de.itkl.tfidf.Idf.Companion.count
|
||||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||||
import kotlinx.coroutines.flow.map
|
import kotlinx.coroutines.*
|
||||||
import kotlinx.coroutines.flow.reduce
|
import kotlinx.coroutines.channels.Channel
|
||||||
import kotlinx.coroutines.flow.take
|
import kotlinx.coroutines.channels.consumeEach
|
||||||
|
import kotlinx.coroutines.flow.*
|
||||||
|
import kotlinx.coroutines.sync.Semaphore
|
||||||
|
import kotlinx.coroutines.sync.withPermit
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger
|
||||||
import kotlin.io.path.nameWithoutExtension
|
import kotlin.io.path.nameWithoutExtension
|
||||||
|
|
||||||
private val Log = KotlinLogging.logger { }
|
private val Log = KotlinLogging.logger { }
|
||||||
|
|
||||||
class Idf : FileProcessor {
|
class Idf : FileProcessor {
|
||||||
|
|
||||||
|
companion object {
|
||||||
|
val count = AtomicInteger(0)
|
||||||
|
}
|
||||||
|
|
||||||
override fun willProduce(path: Path): Path {
|
override fun willProduce(path: Path): Path {
|
||||||
return path.parent.resolve(path.nameWithoutExtension + "-idf.csv")
|
return path.parent.resolve(path.nameWithoutExtension + "-idf.csv")
|
||||||
}
|
}
|
||||||
|
|
||||||
override suspend fun process(resource: Resource): File {
|
override suspend fun process(resource: Resource): File = coroutineScope {
|
||||||
Log.info { "Would produce: ${willProduce(resource.path)}" }
|
Log.info { "Would produce: ${willProduce(resource.path)}" }
|
||||||
val resultFile = willProduce(resource.path).toFile()
|
val resultFile = willProduce(resource.path).toFile()
|
||||||
val textFile = TextFile(resource.read())
|
val channel = Channel<List<String>>(0)
|
||||||
val documents = textFile.splitByEmptyLines()
|
|
||||||
val bagOfWords = ParallelFlowProcessor<List<String>, BagOfWords>(
|
|
||||||
mapperFn = { document ->
|
|
||||||
val tokenizer = Tokenizer()
|
|
||||||
val bagOfWords = document.map { line ->
|
|
||||||
val tokens = tokenizer.tokenize(line)
|
|
||||||
BagOfWords.from(tokens)
|
|
||||||
}
|
|
||||||
.reduce { acc, bagOfWords -> acc.join(bagOfWords) }
|
|
||||||
bagOfWords
|
|
||||||
}
|
|
||||||
).process(documents)
|
|
||||||
|
|
||||||
|
launch {
|
||||||
|
TextFile(resource.read())
|
||||||
|
.splitByEmptyLines()
|
||||||
|
.collect {
|
||||||
|
channel.send(it)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
val bagOfWords = channelFlow {
|
||||||
|
(0..16).map {
|
||||||
|
launch(Dispatchers.Default) {
|
||||||
|
channel.consumeEach {
|
||||||
|
val value = collectWordsOfDocument(it)
|
||||||
|
send(value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
val histogram = Histogram.fromBagOfWords(bagOfWords)
|
val histogram = Histogram.fromBagOfWords(bagOfWords)
|
||||||
HistogramCsvStorage().save(histogram, resultFile)
|
HistogramCsvStorage().save(histogram, resultFile)
|
||||||
return resultFile
|
resultFile
|
||||||
|
}
|
||||||
|
|
||||||
|
private fun collectWordsOfDocument(document: List<String>): BagOfWords {
|
||||||
|
if (document.isEmpty()) {
|
||||||
|
return BagOfWords()
|
||||||
|
}
|
||||||
|
val tokenizer = Tokenizer()
|
||||||
|
val bagOfWords = document.map { line ->
|
||||||
|
val tokens = tokenizer.tokenize(line)
|
||||||
|
BagOfWords.from(tokens)
|
||||||
|
}
|
||||||
|
.reduce { acc, bagOfWords -> acc.join(bagOfWords) }
|
||||||
|
return bagOfWords
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue