adding core api

develop
Timo Bryant 2023-12-27 15:57:41 +01:00
parent 2deaa204c5
commit cc727c681a
16 changed files with 120 additions and 55 deletions

View File

@ -0,0 +1,36 @@
<component name="ProjectRunConfigurationManager">
<!--
  Shared IDE run configuration "docthor [clean]": a Gradle run configuration
  that executes the single task `clean` against the project root
  ($PROJECT_DIR$), with no extra script parameters or VM options.
  The net.ashald.envfile extension block is present but disabled
  (IS_ENABLED=false).
-->
<configuration default="false" name="docthor [clean]" type="GradleRunConfiguration" factoryName="Gradle" nameIsGenerated="true">
<ExternalSystemSettings>
<option name="executionName" />
<option name="externalProjectPath" value="$PROJECT_DIR$" />
<option name="externalSystemIdString" value="GRADLE" />
<option name="scriptParameters" value="" />
<option name="taskDescriptions">
<list />
</option>
<option name="taskNames">
<list>
<option value="clean" />
</list>
</option>
<option name="vmOptions" />
</ExternalSystemSettings>
<ExternalSystemDebugServerProcess>true</ExternalSystemDebugServerProcess>
<ExternalSystemReattachDebugProcess>true</ExternalSystemReattachDebugProcess>
<EXTENSION ID="com.intellij.execution.ExternalSystemRunConfigurationJavaExtension">
<extension name="net.ashald.envfile">
<option name="IS_ENABLED" value="false" />
<option name="IS_SUBST" value="false" />
<option name="IS_PATH_MACRO_SUPPORTED" value="false" />
<option name="IS_IGNORE_MISSING_FILES" value="false" />
<option name="IS_ENABLE_EXPERIMENTAL_INTEGRATIONS" value="false" />
<ENTRIES>
<ENTRY IS_ENABLED="true" PARSER="runconfig" IS_EXECUTABLE="false" />
</ENTRIES>
</extension>
</EXTENSION>
<DebugAllEnabled>false</DebugAllEnabled>
<RunAsTest>false</RunAsTest>
<method v="2" />
</configuration>
</component>

View File

@ -21,4 +21,7 @@ All libraries should be placed under <path>libraries</path>
<def title="io"> <def title="io">
Abstraction for reading from and writing to resources (filesystem, HTTP, S3, etc.) Abstraction for reading from and writing to resources (filesystem, HTTP, S3, etc.)
</def> </def>
<def title="core-api">
Defines the core interfaces
</def>
</deflist> </deflist>

View File

@ -6,6 +6,7 @@ import com.github.ajalt.clikt.parameters.options.option
import com.github.ajalt.clikt.parameters.options.required import com.github.ajalt.clikt.parameters.options.required
import com.github.ajalt.clikt.parameters.types.enum import com.github.ajalt.clikt.parameters.types.enum
import com.github.ajalt.clikt.parameters.types.file import com.github.ajalt.clikt.parameters.types.file
import de.itkl.core_api.coreApiModule
import de.itkl.fileprocessing.ProgressBarFactory import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.textprocessing.textProcessingModule import de.itkl.textprocessing.textProcessingModule
import de.itkl.tfidf.Language import de.itkl.tfidf.Language
@ -33,6 +34,7 @@ class ComputeIdf : CliktCommand() {
fun main(args: Array<String>) { fun main(args: Array<String>) {
startKoin { startKoin {
modules( modules(
coreApiModule,
textProcessingModule, textProcessingModule,
module { module {
single<ProgressBarFactory> { single<ProgressBarFactory> {

View File

@ -0,0 +1,9 @@
package de.itkl.core_api
import de.itkl.core_api.interfaces.NoopResourceReadDecorator
import de.itkl.core_api.interfaces.ResourceReadDecorator
import org.koin.dsl.module
/**
 * Koin module of the core-api library.
 *
 * Registers [NoopResourceReadDecorator] as the `single` definition for
 * [ResourceReadDecorator], so a decorator can always be resolved from Koin
 * once this module is loaded.
 */
val coreApiModule = module {
// Binding for the ResourceReadDecorator interface.
single<ResourceReadDecorator> { NoopResourceReadDecorator() }
}

View File

@ -1,7 +1,6 @@
package de.itkl.core_api.interfaces package de.itkl.core_api.interfaces
import java.io.File import java.io.File
import java.io.InputStream
import java.nio.file.Path import java.nio.file.Path
interface FileProcessor { interface FileProcessor {

View File

@ -3,20 +3,32 @@ package de.itkl.core_api.interfaces
import io.ktor.http.* import io.ktor.http.*
import org.koin.core.component.KoinComponent import org.koin.core.component.KoinComponent
import org.koin.core.component.get import org.koin.core.component.get
import java.io.File
import java.io.InputStream import java.io.InputStream
import java.nio.file.Path
abstract class Resource : KoinComponent { interface Resource : KoinComponent {
abstract val filename: String val filename: String
abstract val contentType: ContentType val contentType: ContentType
abstract val length: Long? // TODO: Find a better method to avoid those nulls. Maybe subtyping the interface
val length: Long?
val file: File?
val path: Path?
fun read(): InputStream
}
protected abstract fun doRead(): InputStream /**
fun read(): InputStream { * Automatically adds koin injectable decorators to reading/writing
* operations
*/
abstract class AbstractResource : Resource, KoinComponent {
abstract fun doRead(): InputStream
final override fun read(): InputStream {
return length?.let { length -> return length?.let { length ->
get<ResourceReadDecorator>().decorate( get<ResourceReadDecorator>().decorate(
length = length, length = length,
read() doRead()
) )
} ?: read() } ?: doRead()
} }
} }

View File

@ -0,0 +1,24 @@
package de.itkl.fileprocessing
import de.itkl.core_api.interfaces.AbstractResource
import io.ktor.http.*
import java.io.File
import java.io.InputStream
import java.nio.file.Files
import java.nio.file.Path
import kotlin.io.path.name
/**
 * An [AbstractResource] backed by a file addressed through a [Path].
 *
 * @property path location of the underlying file; never null for this implementation.
 */
class FileResource(override val path: Path) : AbstractResource() {
    /** Convenience constructor that converts a [File] to its [Path]. */
    constructor(file: File) : this(file.toPath())

    /**
     * Size of the file in bytes as reported by [File.length].
     * Computed on first access and cached afterwards.
     */
    override val length: Long by lazy { path.toFile().length() }

    /** The [path] converted to a [File]. */
    override val file: File?
        get() = path.toFile()

    /** Last segment of [path]. */
    override val filename: String
        get() = path.name

    /**
     * Content type derived from the file name's extension.
     *
     * `ContentType.fromFilePath` returns an empty list when the name has no
     * extension or the extension is unknown; calling `first()` on that would throw
     * `NoSuchElementException`, so fall back to `application/octet-stream`.
     */
    override val contentType: ContentType
        get() = ContentType.fromFilePath(path.name).firstOrNull()
            ?: ContentType.Application.OctetStream

    /** Opens a new [InputStream] on [path] each time it is called. */
    override fun doRead(): InputStream = Files.newInputStream(path)
}

View File

@ -1,5 +1,7 @@
package de.itkl.fileprocessing package de.itkl.fileprocessing
import de.itkl.core_api.interfaces.Resource
interface ProgressBarFactory { interface ProgressBarFactory {
fun new(resource: Resource): ProgressBar fun new(resource: Resource): ProgressBar
fun new(name: String, max: Long): ProgressBar fun new(name: String, max: Long): ProgressBar

View File

@ -1,22 +1,14 @@
package de.itkl.fileprocessing package de.itkl.fileprocessing
import de.itkl.core_api.interfaces.AbstractResource
import de.itkl.core_api.interfaces.Resource
import io.ktor.http.*
import java.io.File import java.io.File
import java.io.InputStream import java.io.InputStream
import java.nio.file.Files import java.nio.file.Files
import java.nio.file.Path import java.nio.file.Path
import kotlin.io.path.name import kotlin.io.path.name
interface Resource {
val path: Path
val size: Long
val filename: String
fun toFile(): File = path.toFile()
fun length() = path.toFile().length()
fun read(): InputStream
}
class ProgressResource( class ProgressResource(
private val resource: Resource, private val resource: Resource,
private val progressBarFactory: ProgressBarFactory private val progressBarFactory: ProgressBarFactory
@ -29,14 +21,3 @@ class ProgressResource(
) )
} }
} }
class FileResource(override val path: Path) : Resource {
constructor(file: File): this(file.toPath())
override val size: Long by lazy { path.toFile().length() }
override val filename: String
get() = path.name
override fun read(): InputStream {
return Files.newInputStream(path)
}
}

View File

@ -1,4 +1,5 @@
dependencies { dependencies {
api(project(":libraries:core-api"))
api("org.apache.lucene:lucene-analysis-common:9.9.0") api("org.apache.lucene:lucene-analysis-common:9.9.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2") implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
implementation("com.google.guava:guava:32.1.3-jre") implementation("com.google.guava:guava:32.1.3-jre")

View File

@ -2,6 +2,7 @@ package de.itkl.textprocessing
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.core_api.interfaces.Resource
import java.io.File import java.io.File
import java.nio.file.Path import java.nio.file.Path
@ -16,9 +17,9 @@ class HistogramCsvStorage {
} }
} }
} }
suspend fun read(file: File): Histogram { suspend fun read(resource: Resource): Histogram {
return csvReader { } return csvReader { }
.openAsync(file) { .openAsync(resource.read()) {
val sequence = readAllWithHeaderAsSequence() val sequence = readAllWithHeaderAsSequence()
Histogram.from(sequence) Histogram.from(sequence)
} }

View File

@ -1,6 +1,7 @@
dependencies { dependencies {
api(project(":libraries:textprocessing")) api(project(":libraries:textprocessing"))
api(project(":libraries:fileprocessing")) api(project(":libraries:fileprocessing"))
api(project(":libraries:core-api"))
implementation("com.github.ajalt.mordant:mordant:2.2.0") implementation("com.github.ajalt.mordant:mordant:2.2.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2") implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
implementation("com.google.guava:guava:32.1.3-jre") implementation("com.google.guava:guava:32.1.3-jre")

View File

@ -1,7 +1,7 @@
package de.itkl.tfidf package de.itkl.tfidf
import de.itkl.fileprocessing.FileProcessor import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.fileprocessing.Resource import de.itkl.core_api.interfaces.Resource
import de.itkl.processing.parallelUnordered import de.itkl.processing.parallelUnordered
import de.itkl.textprocessing.* import de.itkl.textprocessing.*
import de.itkl.textprocessing.interfaces.Stemmer import de.itkl.textprocessing.interfaces.Stemmer
@ -24,8 +24,8 @@ class DocumentFrequency : FileProcessor, KoinComponent {
} }
override suspend fun process(resource: Resource): File = coroutineScope { override suspend fun process(resource: Resource): File = coroutineScope {
Log.info { "Would produce: ${willProduce(resource.path)}" } Log.info { "Would produce: ${willProduce(resource.path!!)}" }
val resultFile = willProduce(resource.path).toFile() val resultFile = willProduce(resource.path!!).toFile()
val (numDocs, histogram) = TextFile(resource.read()) val (numDocs, histogram) = TextFile(resource.read())
.splitByEmptyLines() .splitByEmptyLines()
.withIndex() .withIndex()

View File

@ -1,43 +1,39 @@
package de.itkl.tfidf package de.itkl.tfidf
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.fileprocessing.FileProcessor import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.Resource
import de.itkl.fileprocessing.ProgressBarFactory import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.fileprocessing.Resource
import de.itkl.textprocessing.HistogramCsvStorage import de.itkl.textprocessing.HistogramCsvStorage
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.component.KoinComponent import org.koin.core.component.KoinComponent
import org.koin.core.component.inject import org.koin.core.component.inject
import java.io.File import java.io.File
import java.nio.file.Path import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension import kotlin.io.path.nameWithoutExtension
import kotlin.math.ln
import kotlin.math.log
import kotlin.math.log10 import kotlin.math.log10
import kotlin.math.log2
private val Log = KotlinLogging.logger { }
class InverseDocumentFrequency : FileProcessor, KoinComponent { class InverseDocumentFrequency : FileProcessor, KoinComponent {
override fun willProduce(path: Path): Path { override fun willProduce(path: Path): Path {
return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv") return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
} }
override suspend fun process(resource: Resource): File { override suspend fun process(resource: Resource): File {
val histogram = HistogramCsvStorage().read(resource.toFile()) val histogram = HistogramCsvStorage().read(resource)
val numDocs = histogram val numDocs = histogram
.find { (word, count) -> word == "\$numDocs" }!! .find { (word, _) -> word == "\$numDocs" }!!
.second.toInt() .second.toInt()
val progressBarFactory: ProgressBarFactory by inject() val progressBarFactory: ProgressBarFactory by inject()
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progess -> return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
csvWriter().openAsync(willProduce(resource.path).toFile(), append = false) { csvWriter().openAsync(willProduce(resource.path!!).toFile(), append = false) {
writeRow("word", "idf") writeRow("word", "idf")
histogram.forEach { (word, count) -> histogram.forEach { (word, count) ->
writeRow(word, idf(numDocs, count)) writeRow(word, idf(numDocs, count))
progess.step() progress.step()
} }
} }
resource.path.toFile() resource.path!!.toFile()
} }
} }

View File

@ -3,9 +3,9 @@ package de.itkl.tfidf
import com.github.ajalt.mordant.animation.ProgressAnimation import com.github.ajalt.mordant.animation.ProgressAnimation
import com.github.ajalt.mordant.animation.progressAnimation import com.github.ajalt.mordant.animation.progressAnimation
import com.github.ajalt.mordant.terminal.Terminal import com.github.ajalt.mordant.terminal.Terminal
import de.itkl.core_api.interfaces.Resource
import de.itkl.fileprocessing.ProgressBar import de.itkl.fileprocessing.ProgressBar
import de.itkl.fileprocessing.ProgressBarFactory import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.fileprocessing.Resource
class TerminalProgressBarFactory : ProgressBarFactory { class TerminalProgressBarFactory : ProgressBarFactory {
private val terminal = Terminal() private val terminal = Terminal()
@ -17,7 +17,7 @@ class TerminalProgressBarFactory : ProgressBarFactory {
completed() completed()
timeRemaining() timeRemaining()
} }
return TerminalProgressBar(animation, resource.length()) return TerminalProgressBar(animation, resource.length!!)
} }
override fun new(name: String, max: Long): ProgressBar { override fun new(name: String, max: Long): ProgressBar {

View File

@ -1,9 +1,7 @@
package de.itkl.tfidf package de.itkl.tfidf
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.fileprocessing.FileProcessingPipeline import de.itkl.fileprocessing.FileProcessingPipeline
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory
import org.koin.core.component.KoinComponent
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) { class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
override val fileProcessor = listOf<FileProcessor>( override val fileProcessor = listOf<FileProcessor>(