Compare commits
No commits in common. "9f3813a83abe26d9a0da9f3402ae0a233b100597" and "2deaa204c5ef3d10db58047ccf7c1c2121b29fdd" have entirely different histories.
9f3813a83a
...
2deaa204c5
|
|
@ -1,36 +0,0 @@
|
|||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="docthor [clean]" type="GradleRunConfiguration" factoryName="Gradle" nameIsGenerated="true">
|
||||
<ExternalSystemSettings>
|
||||
<option name="executionName" />
|
||||
<option name="externalProjectPath" value="$PROJECT_DIR$" />
|
||||
<option name="externalSystemIdString" value="GRADLE" />
|
||||
<option name="scriptParameters" value="" />
|
||||
<option name="taskDescriptions">
|
||||
<list />
|
||||
</option>
|
||||
<option name="taskNames">
|
||||
<list>
|
||||
<option value="clean" />
|
||||
</list>
|
||||
</option>
|
||||
<option name="vmOptions" />
|
||||
</ExternalSystemSettings>
|
||||
<ExternalSystemDebugServerProcess>true</ExternalSystemDebugServerProcess>
|
||||
<ExternalSystemReattachDebugProcess>true</ExternalSystemReattachDebugProcess>
|
||||
<EXTENSION ID="com.intellij.execution.ExternalSystemRunConfigurationJavaExtension">
|
||||
<extension name="net.ashald.envfile">
|
||||
<option name="IS_ENABLED" value="false" />
|
||||
<option name="IS_SUBST" value="false" />
|
||||
<option name="IS_PATH_MACRO_SUPPORTED" value="false" />
|
||||
<option name="IS_IGNORE_MISSING_FILES" value="false" />
|
||||
<option name="IS_ENABLE_EXPERIMENTAL_INTEGRATIONS" value="false" />
|
||||
<ENTRIES>
|
||||
<ENTRY IS_ENABLED="true" PARSER="runconfig" IS_EXECUTABLE="false" />
|
||||
</ENTRIES>
|
||||
</extension>
|
||||
</EXTENSION>
|
||||
<DebugAllEnabled>false</DebugAllEnabled>
|
||||
<RunAsTest>false</RunAsTest>
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
||||
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
<instance-profile id="d"
|
||||
name="Docthor"
|
||||
start-page="docthor.md">
|
||||
start-page="starter-topic.md">
|
||||
|
||||
<toc-element topic="docthor.md"/>
|
||||
<toc-element topic="starter-topic.md"/>
|
||||
</instance-profile>
|
||||
|
|
@ -21,12 +21,4 @@ All libraries should be placed unter <path>libraries</path>
|
|||
<def title="io">
|
||||
Abstraction about reading/writing to resources (filesystem, http, s3, etc pp)
|
||||
</def>
|
||||
<def title="core-api">
|
||||
Defines the core interfaces
|
||||
</def>
|
||||
<def title="tui">
|
||||
Provides tui capabilities. When applied as koin modules
|
||||
the resources will automatically print a read/write progressbar
|
||||
on terminal.
|
||||
</def>
|
||||
</deflist>
|
||||
|
|
@ -4,7 +4,6 @@ plugins {
|
|||
|
||||
dependencies {
|
||||
implementation(project(":libraries:tfidf"))
|
||||
implementation(project(":libraries:tui"))
|
||||
}
|
||||
|
||||
application {
|
||||
|
|
|
|||
|
|
@ -6,13 +6,15 @@ import com.github.ajalt.clikt.parameters.options.option
|
|||
import com.github.ajalt.clikt.parameters.options.required
|
||||
import com.github.ajalt.clikt.parameters.types.enum
|
||||
import com.github.ajalt.clikt.parameters.types.file
|
||||
import de.itkl.core_api.coreApiModule
|
||||
import de.itkl.fileprocessing.ProgressBarFactory
|
||||
import de.itkl.textprocessing.textProcessingModule
|
||||
import de.itkl.tfidf.Language
|
||||
import de.itkl.tfidf.TerminalProgressBarFactory
|
||||
//import de.itkl.tfidf.TfIdf
|
||||
import de.itkl.tfidf.TfIdfPipeline
|
||||
import de.itkl.tui.tuiModule
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import org.koin.core.context.startKoin
|
||||
import org.koin.dsl.module
|
||||
|
||||
class ComputeIdf : CliktCommand() {
|
||||
private val corpus by option(help = "corpus")
|
||||
|
|
@ -31,9 +33,12 @@ class ComputeIdf : CliktCommand() {
|
|||
fun main(args: Array<String>) {
|
||||
startKoin {
|
||||
modules(
|
||||
coreApiModule,
|
||||
textProcessingModule,
|
||||
tuiModule)
|
||||
module {
|
||||
single<ProgressBarFactory> {
|
||||
TerminalProgressBarFactory()
|
||||
}
|
||||
})
|
||||
ComputeIdf().main(args)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,5 +7,5 @@ repositories {
|
|||
}
|
||||
|
||||
dependencies {
|
||||
implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:$embeddedKotlinVersion")
|
||||
implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:1.8.20")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,11 +13,6 @@ dependencies {
|
|||
val koin_version = "3.5.3"
|
||||
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3")
|
||||
implementation("io.insert-koin:koin-core:$koin_version")
|
||||
implementation("org.jetbrains.kotlinx:kotlinx-datetime:0.5.0")
|
||||
implementation("org.jetbrains.kotlinx:kotlinx-serialization-json:1.6.2")
|
||||
|
||||
|
||||
testImplementation("io.insert-koin:koin-test:$koin_version")
|
||||
}
|
||||
|
||||
java {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,6 @@
|
|||
package de.itkl.clients
|
||||
|
||||
class MsOcr {
|
||||
|
||||
suspend fun ocr() {}
|
||||
}
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
package de.itkl.core_api
|
||||
|
||||
import de.itkl.core_api.interfaces.NoopResourceReadDecorator
|
||||
import de.itkl.core_api.interfaces.ResourceFactory
|
||||
import de.itkl.core_api.interfaces.ResourceReadDecorator
|
||||
import org.koin.dsl.module
|
||||
|
||||
val coreApiModule = module {
|
||||
single<ResourceFactory> { ResourceFactory()}
|
||||
single<ResourceReadDecorator> { NoopResourceReadDecorator() }
|
||||
}
|
||||
|
|
@ -1,24 +0,0 @@
|
|||
package de.itkl.core_api.implementation
|
||||
|
||||
import de.itkl.core_api.interfaces.AbstractResource
|
||||
import io.ktor.http.*
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.name
|
||||
|
||||
class FileResource(override val path: Path) : AbstractResource() {
|
||||
constructor(file: File): this(file.toPath())
|
||||
override val length: Long by lazy { path.toFile().length() }
|
||||
override val file: File?
|
||||
get() = path.toFile()
|
||||
|
||||
override fun doRead(): InputStream {
|
||||
return Files.newInputStream(path)
|
||||
}
|
||||
override val filename: String
|
||||
get() = path.name
|
||||
override val contentType: ContentType
|
||||
get() = ContentType.fromFilePath(path.name).first()
|
||||
}
|
||||
|
|
@ -1,19 +0,0 @@
|
|||
package de.itkl.core_api.implementation
|
||||
|
||||
import de.itkl.core_api.implementation.ProgressInputStream
|
||||
import de.itkl.core_api.interfaces.ProgressBarFactory
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import java.io.InputStream
|
||||
|
||||
internal class ProgressResource(
|
||||
private val resource: Resource,
|
||||
private val progressBarFactory: ProgressBarFactory
|
||||
) : Resource by resource
|
||||
{
|
||||
override fun read(): InputStream {
|
||||
return ProgressInputStream(
|
||||
resource.read(),
|
||||
progressBarFactory.new(this)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
package de.itkl.core_api.interfaces
|
||||
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.nio.file.Path
|
||||
|
||||
interface FileProcessor {
|
||||
|
|
|
|||
|
|
@ -3,32 +3,20 @@ package de.itkl.core_api.interfaces
|
|||
import io.ktor.http.*
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.get
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.nio.file.Path
|
||||
|
||||
interface Resource {
|
||||
val filename: String
|
||||
val contentType: ContentType
|
||||
// TODO: Find a better method to avoid those nulls. Maybe subtyping the interface
|
||||
val length: Long?
|
||||
val file: File?
|
||||
val path: Path?
|
||||
fun read(): InputStream
|
||||
}
|
||||
abstract class Resource : KoinComponent {
|
||||
abstract val filename: String
|
||||
abstract val contentType: ContentType
|
||||
abstract val length: Long?
|
||||
|
||||
/**
|
||||
* Automatically adds koin injectable decorators to reading/writing
|
||||
* operations
|
||||
*/
|
||||
abstract class AbstractResource : Resource, KoinComponent {
|
||||
abstract fun doRead(): InputStream
|
||||
final override fun read(): InputStream {
|
||||
protected abstract fun doRead(): InputStream
|
||||
fun read(): InputStream {
|
||||
return length?.let { length ->
|
||||
get<ResourceReadDecorator>().decorate(
|
||||
length = length,
|
||||
doRead()
|
||||
read()
|
||||
)
|
||||
} ?: doRead()
|
||||
} ?: read()
|
||||
}
|
||||
}
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
package de.itkl.core_api.interfaces
|
||||
|
||||
import de.itkl.core_api.implementation.FileResource
|
||||
import de.itkl.core_api.implementation.ProgressResource
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.inject
|
||||
import java.io.File
|
||||
|
||||
class ResourceFactory : KoinComponent {
|
||||
|
||||
private val progressBarFactory by inject<ProgressBarFactory>()
|
||||
fun file(file: File): Resource {
|
||||
val resource = FileResource(file)
|
||||
return ProgressResource(resource, progressBarFactory)
|
||||
}
|
||||
}
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
package de.itkl.core_api.interfaces.data
|
||||
interface DataTable : Iterable<List<String>> {
|
||||
val columns: List<String>
|
||||
}
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
package de.itkl.fileprocessing
|
||||
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
import de.itkl.core_api.interfaces.ResourceFactory
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.inject
|
||||
|
|
@ -11,9 +10,10 @@ import kotlin.io.path.exists
|
|||
private val Log = KotlinLogging.logger { }
|
||||
|
||||
abstract class FileProcessingPipeline(private val force: Boolean = false) : KoinComponent {
|
||||
private val resourceFactory: ResourceFactory by inject()
|
||||
|
||||
|
||||
protected abstract val fileProcessor: List<FileProcessor>
|
||||
private val progressBarFactory: ProgressBarFactory by inject()
|
||||
suspend fun input(file: File) {
|
||||
var currentFile = file
|
||||
fileProcessor.forEach { processor ->
|
||||
|
|
@ -22,8 +22,9 @@ abstract class FileProcessingPipeline(private val force: Boolean = false) : Koin
|
|||
Log.info { "$target exists. Skipping" }
|
||||
} else {
|
||||
Log.info { "$target does not exists. Creating" }
|
||||
val resource = resourceFactory.file(currentFile)
|
||||
processor.process(resource)
|
||||
val resource = FileResource(currentFile)
|
||||
val progress = ProgressResource(resource, progressBarFactory)
|
||||
processor.process(progress)
|
||||
Log.info { "File created: $target" }
|
||||
}
|
||||
currentFile = target.toFile()
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
package de.itkl.core_api.interfaces
|
||||
package de.itkl.fileprocessing
|
||||
|
||||
interface ProgressBarFactory {
|
||||
fun new(resource: Resource): ProgressBar
|
||||
|
|
@ -1,6 +1,5 @@
|
|||
package de.itkl.core_api.implementation
|
||||
package de.itkl.fileprocessing
|
||||
|
||||
import de.itkl.core_api.interfaces.ProgressBar
|
||||
import java.io.InputStream
|
||||
|
||||
/**
|
||||
|
|
@ -10,10 +9,9 @@ import java.io.InputStream
|
|||
* @property updateOp The operation to be executed when the number of bytes read changes.
|
||||
* @property bytesRead The number of bytes read from the input stream.
|
||||
*/
|
||||
internal class ProgressInputStream(
|
||||
class ProgressInputStream(
|
||||
private val inputStream: InputStream,
|
||||
private val progressBar: ProgressBar
|
||||
) : InputStream() {
|
||||
private val progressBar: ProgressBar) : InputStream() {
|
||||
@Volatile
|
||||
var bytesRead: Long = 0
|
||||
private set(value) {
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
package de.itkl.fileprocessing
|
||||
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.name
|
||||
|
||||
interface Resource {
|
||||
val path: Path
|
||||
val size: Long
|
||||
val filename: String
|
||||
fun toFile(): File = path.toFile()
|
||||
|
||||
fun length() = path.toFile().length()
|
||||
|
||||
fun read(): InputStream
|
||||
}
|
||||
|
||||
class ProgressResource(
|
||||
private val resource: Resource,
|
||||
private val progressBarFactory: ProgressBarFactory
|
||||
) : Resource by resource
|
||||
{
|
||||
override fun read(): InputStream {
|
||||
return ProgressInputStream(
|
||||
resource.read(),
|
||||
progressBarFactory.new(this)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
class FileResource(override val path: Path) : Resource {
|
||||
constructor(file: File): this(file.toPath())
|
||||
override val size: Long by lazy { path.toFile().length() }
|
||||
override val filename: String
|
||||
get() = path.name
|
||||
|
||||
override fun read(): InputStream {
|
||||
return Files.newInputStream(path)
|
||||
}
|
||||
}
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
plugins {
|
||||
kotlin("plugin.serialization") version embeddedKotlinVersion
|
||||
}
|
||||
|
||||
val ktorVersion: String by project
|
||||
|
||||
dependencies {
|
||||
api(project(":libraries:core-api"))
|
||||
|
||||
api("io.ktor:ktor-client-core:$ktorVersion")
|
||||
api("io.ktor:ktor-client-core-jvm:$ktorVersion")
|
||||
implementation("io.ktor:ktor-client-cio:$ktorVersion")
|
||||
implementation("io.ktor:ktor-client-content-negotiation:$ktorVersion")
|
||||
implementation("io.ktor:ktor-serialization-kotlinx-json:$ktorVersion")
|
||||
}
|
||||
|
|
@ -1 +0,0 @@
|
|||
ktorVersion=2.3.7
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
package de.itkl.httpClient.clients
|
||||
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
import io.ktor.client.*
|
||||
import io.ktor.client.call.*
|
||||
import io.ktor.client.request.*
|
||||
import io.ktor.client.statement.*
|
||||
import io.ktor.http.*
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.inject
|
||||
|
||||
private val Log = KotlinLogging.logger { }
|
||||
class MsOcr: KoinComponent {
|
||||
private val httpClient: HttpClient by inject()
|
||||
|
||||
suspend fun ocr(resource: Resource): MsOcrResponse {
|
||||
val response = httpClient.post {
|
||||
url("http://10.54.150.152:5000/vision/v3.2/read/syncAnalyze")
|
||||
parameters {
|
||||
append("language", "de")
|
||||
append("readingOrder", "natural")
|
||||
}
|
||||
contentType(resource.contentType)
|
||||
setBody(resource.read())
|
||||
}
|
||||
println("got response: ${response.status} in ${response.responseTime}")
|
||||
return response.body()
|
||||
}
|
||||
}
|
||||
|
|
@ -1,81 +0,0 @@
|
|||
package de.itkl.httpClient.clients
|
||||
|
||||
|
||||
import kotlinx.datetime.Instant
|
||||
import kotlinx.datetime.LocalDateTime
|
||||
import kotlinx.serialization.SerialName
|
||||
import kotlinx.serialization.Serializable
|
||||
|
||||
@Serializable
|
||||
data class MsOcrResponse(
|
||||
@SerialName("analyzeResult")
|
||||
val analyzeResult: AnalyzeResult,
|
||||
@SerialName("createdDateTime")
|
||||
val createdDateTime: Instant, // 2023-12-29T21:02:30Z
|
||||
@SerialName("lastUpdatedDateTime")
|
||||
val lastUpdatedDateTime: Instant, // 2023-12-29T21:02:31Z
|
||||
@SerialName("status")
|
||||
val status: String // succeeded
|
||||
) {
|
||||
@Serializable
|
||||
data class AnalyzeResult(
|
||||
@SerialName("modelVersion")
|
||||
val modelVersion: String, // 2022-04-30
|
||||
@SerialName("readResults")
|
||||
val readResults: List<ReadResult>,
|
||||
@SerialName("version")
|
||||
val version: String // 3.2.0
|
||||
) {
|
||||
@Serializable
|
||||
data class ReadResult(
|
||||
@SerialName("angle")
|
||||
val angle: Int, // 0
|
||||
@SerialName("height")
|
||||
val height: Int, // 3507
|
||||
@SerialName("lines")
|
||||
val lines: List<Line>,
|
||||
@SerialName("page")
|
||||
val page: Int, // 1
|
||||
@SerialName("unit")
|
||||
val unit: String, // pixel
|
||||
@SerialName("width")
|
||||
val width: Int // 2481
|
||||
) {
|
||||
@Serializable
|
||||
data class Line(
|
||||
@SerialName("appearance")
|
||||
val appearance: Appearance,
|
||||
@SerialName("boundingBox")
|
||||
val boundingBox: List<Int>,
|
||||
@SerialName("text")
|
||||
val text: String, // Franz Mustermann
|
||||
@SerialName("words")
|
||||
val words: List<Word>
|
||||
) {
|
||||
@Serializable
|
||||
data class Appearance(
|
||||
@SerialName("style")
|
||||
val style: Style
|
||||
) {
|
||||
@Serializable
|
||||
data class Style(
|
||||
@SerialName("confidence")
|
||||
val confidence: Double, // 0.972
|
||||
@SerialName("name")
|
||||
val name: String // other
|
||||
)
|
||||
}
|
||||
|
||||
@Serializable
|
||||
data class Word(
|
||||
@SerialName("boundingBox")
|
||||
val boundingBox: List<Int>,
|
||||
@SerialName("confidence")
|
||||
val confidence: Double, // 0.998
|
||||
@SerialName("text")
|
||||
val text: String // Franz
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
package de.itkl.httpClient
|
||||
|
||||
import io.ktor.client.*
|
||||
import io.ktor.client.engine.cio.*
|
||||
import io.ktor.client.plugins.contentnegotiation.*
|
||||
import io.ktor.serialization.kotlinx.json.*
|
||||
|
||||
fun createHttpClient(): HttpClient {
|
||||
return HttpClient(CIO) {
|
||||
install(ContentNegotiation) {
|
||||
json()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
package de.itkl.httpClient
|
||||
|
||||
import de.itkl.httpClient.clients.MsOcr
|
||||
import io.ktor.client.*
|
||||
import org.koin.dsl.module
|
||||
|
||||
val httpClientModule = module {
|
||||
single<HttpClient> { createHttpClient() }
|
||||
single<MsOcr> { MsOcr() }
|
||||
}
|
||||
|
|
@ -1,36 +0,0 @@
|
|||
package de.itkl.httpClient.clients
|
||||
|
||||
import de.itkl.core_api.coreApiModule
|
||||
import de.itkl.core_api.implementation.FileResource
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.httpClient.httpClientModule
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import org.junit.Rule
|
||||
import org.junit.jupiter.api.BeforeEach
|
||||
import org.junit.jupiter.api.Test
|
||||
import org.koin.core.component.inject
|
||||
import org.koin.core.context.startKoin
|
||||
import org.koin.test.KoinTest
|
||||
import java.nio.file.Paths
|
||||
|
||||
class MsOcrTest : KoinTest {
|
||||
|
||||
@BeforeEach
|
||||
fun start() {
|
||||
startKoin {
|
||||
printLogger()
|
||||
modules(
|
||||
coreApiModule,
|
||||
httpClientModule)
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `can create a request`() = runBlocking {
|
||||
val msOcrClient: MsOcr by inject()
|
||||
val resource = FileResource(Paths.get("../../assets/xs-reg/00001.jpg").toAbsolutePath())
|
||||
val response = msOcrClient.ocr(resource)
|
||||
println(response)
|
||||
Unit
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,4 @@
|
|||
dependencies {
|
||||
api(project(":libraries:core-api"))
|
||||
api("org.apache.lucene:lucene-analysis-common:9.9.0")
|
||||
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
||||
implementation("com.google.guava:guava:32.1.3-jre")
|
||||
|
|
|
|||
|
|
@ -2,16 +2,30 @@ package de.itkl.textprocessing
|
|||
|
||||
import kotlinx.coroutines.flow.*
|
||||
|
||||
class Histogram(
|
||||
private val histo: MutableMap<String,UInt> = mutableMapOf()
|
||||
) : Iterable<Pair<String, UInt>>{
|
||||
class Histogram(private val histo: MutableMap<String,UInt> = mutableMapOf()) : Iterable<Pair<String, UInt>>{
|
||||
|
||||
companion object {
|
||||
suspend fun from(flow: Flow<String>): Histogram {
|
||||
return Histogram().apply {
|
||||
flow.collect(this::add)
|
||||
}
|
||||
}
|
||||
|
||||
fun fromBagOfWords(bagOfWords: BagOfWords): Histogram {
|
||||
val result = Histogram()
|
||||
bagOfWords.forEach(result::add)
|
||||
return result
|
||||
}
|
||||
|
||||
|
||||
suspend fun fromBagOfWords(flow: Flow<BagOfWords>): Histogram {
|
||||
val result = Histogram()
|
||||
flow.collect() { value ->
|
||||
value.forEach(result::add)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
fun from(sequence: Sequence<Map<String, String>>): Histogram {
|
||||
val histo = sequence.associate { map -> map["word"]!! to map["count"]!!.toUInt() }
|
||||
.toMutableMap()
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ package de.itkl.textprocessing
|
|||
|
||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
|
||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import java.io.File
|
||||
import java.nio.file.Path
|
||||
|
||||
|
|
@ -17,9 +16,9 @@ class HistogramCsvStorage {
|
|||
}
|
||||
}
|
||||
}
|
||||
suspend fun read(resource: Resource): Histogram {
|
||||
suspend fun read(file: File): Histogram {
|
||||
return csvReader { }
|
||||
.openAsync(resource.read()) {
|
||||
.openAsync(file) {
|
||||
val sequence = readAllWithHeaderAsSequence()
|
||||
Histogram.from(sequence)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
dependencies {
|
||||
api(project(":libraries:textprocessing"))
|
||||
api(project(":libraries:fileprocessing"))
|
||||
api(project(":libraries:core-api"))
|
||||
implementation("com.github.ajalt.mordant:mordant:2.2.0")
|
||||
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
||||
implementation("com.google.guava:guava:32.1.3-jre")
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
package de.itkl.tfidf
|
||||
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.fileprocessing.FileProcessor
|
||||
import de.itkl.fileprocessing.Resource
|
||||
import de.itkl.processing.parallelUnordered
|
||||
import de.itkl.textprocessing.*
|
||||
import de.itkl.textprocessing.interfaces.Stemmer
|
||||
|
|
@ -24,8 +24,8 @@ class DocumentFrequency : FileProcessor, KoinComponent {
|
|||
}
|
||||
|
||||
override suspend fun process(resource: Resource): File = coroutineScope {
|
||||
Log.info { "Would produce: ${willProduce(resource.path!!)}" }
|
||||
val resultFile = willProduce(resource.path!!).toFile()
|
||||
Log.info { "Would produce: ${willProduce(resource.path)}" }
|
||||
val resultFile = willProduce(resource.path).toFile()
|
||||
val (numDocs, histogram) = TextFile(resource.read())
|
||||
.splitByEmptyLines()
|
||||
.withIndex()
|
||||
|
|
|
|||
|
|
@ -1,39 +1,43 @@
|
|||
package de.itkl.tfidf
|
||||
|
||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.core_api.interfaces.ProgressBarFactory
|
||||
import de.itkl.fileprocessing.FileProcessor
|
||||
import de.itkl.fileprocessing.ProgressBarFactory
|
||||
import de.itkl.fileprocessing.Resource
|
||||
import de.itkl.textprocessing.HistogramCsvStorage
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.inject
|
||||
import java.io.File
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.nameWithoutExtension
|
||||
import kotlin.math.ln
|
||||
import kotlin.math.log
|
||||
import kotlin.math.log10
|
||||
import kotlin.math.log2
|
||||
|
||||
private val Log = KotlinLogging.logger { }
|
||||
|
||||
class InverseDocumentFrequency : FileProcessor, KoinComponent {
|
||||
override fun willProduce(path: Path): Path {
|
||||
return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
|
||||
}
|
||||
|
||||
|
||||
override suspend fun process(resource: Resource): File {
|
||||
val histogram = HistogramCsvStorage().read(resource)
|
||||
val histogram = HistogramCsvStorage().read(resource.toFile())
|
||||
val numDocs = histogram
|
||||
.find { (word, _) -> word == "\$numDocs" }!!
|
||||
.find { (word, count) -> word == "\$numDocs" }!!
|
||||
.second.toInt()
|
||||
val progressBarFactory: ProgressBarFactory by inject()
|
||||
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
|
||||
csvWriter().openAsync(willProduce(resource.path!!).toFile(), append = false) {
|
||||
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progess ->
|
||||
csvWriter().openAsync(willProduce(resource.path).toFile(), append = false) {
|
||||
writeRow("word", "idf")
|
||||
histogram.forEach { (word, count) ->
|
||||
writeRow(word, idf(numDocs, count))
|
||||
progress.step()
|
||||
progess.step()
|
||||
}
|
||||
}
|
||||
resource.path!!.toFile()
|
||||
resource.path.toFile()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
package de.itkl.tui.implementation
|
||||
package de.itkl.tfidf
|
||||
|
||||
import com.github.ajalt.mordant.animation.ProgressAnimation
|
||||
import com.github.ajalt.mordant.animation.progressAnimation
|
||||
import com.github.ajalt.mordant.terminal.Terminal
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.core_api.interfaces.ProgressBar
|
||||
import de.itkl.core_api.interfaces.ProgressBarFactory
|
||||
import de.itkl.fileprocessing.ProgressBar
|
||||
import de.itkl.fileprocessing.ProgressBarFactory
|
||||
import de.itkl.fileprocessing.Resource
|
||||
|
||||
class TerminalProgressBarFactory : ProgressBarFactory {
|
||||
private val terminal = Terminal()
|
||||
|
|
@ -17,7 +17,7 @@ class TerminalProgressBarFactory : ProgressBarFactory {
|
|||
completed()
|
||||
timeRemaining()
|
||||
}
|
||||
return TerminalProgressBar(animation, resource.length!!)
|
||||
return TerminalProgressBar(animation, resource.length())
|
||||
}
|
||||
|
||||
override fun new(name: String, max: Long): ProgressBar {
|
||||
|
|
@ -1,7 +1,9 @@
|
|||
package de.itkl.tfidf
|
||||
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
import de.itkl.fileprocessing.FileProcessingPipeline
|
||||
import de.itkl.fileprocessing.FileProcessor
|
||||
import de.itkl.fileprocessing.ProgressBarFactory
|
||||
import org.koin.core.component.KoinComponent
|
||||
|
||||
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
|
||||
override val fileProcessor = listOf<FileProcessor>(
|
||||
|
|
|
|||
|
|
@ -1,4 +0,0 @@
|
|||
dependencies {
|
||||
api(project(":libraries:core-api"))
|
||||
implementation("com.github.ajalt.mordant:mordant:2.2.0")
|
||||
}
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
package de.itkl.tui.implementation
|
||||
|
||||
class TerminalDataTableReporter {
|
||||
}
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
package de.itkl.tui
|
||||
|
||||
import de.itkl.core_api.interfaces.ProgressBarFactory
|
||||
import de.itkl.tui.implementation.TerminalProgressBarFactory
|
||||
import org.koin.dsl.module
|
||||
|
||||
/**
|
||||
* Add terminal ui capabilities
|
||||
*/
|
||||
val tuiModule = module {
|
||||
single<ProgressBarFactory> {
|
||||
TerminalProgressBarFactory()
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue