adding core api

Timo Bryant 2023-12-27 15:57:41 +01:00
parent 2deaa204c5
commit 6971e00221
14 changed files with 108 additions and 55 deletions

View File

@ -0,0 +1,36 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="docthor [clean]" type="GradleRunConfiguration" factoryName="Gradle" nameIsGenerated="true">
<ExternalSystemSettings>
<option name="executionName" />
<option name="externalProjectPath" value="$PROJECT_DIR$" />
<option name="externalSystemIdString" value="GRADLE" />
<option name="scriptParameters" value="" />
<option name="taskDescriptions">
<list />
</option>
<option name="taskNames">
<list>
<option value="clean" />
</list>
</option>
<option name="vmOptions" />
</ExternalSystemSettings>
<ExternalSystemDebugServerProcess>true</ExternalSystemDebugServerProcess>
<ExternalSystemReattachDebugProcess>true</ExternalSystemReattachDebugProcess>
<EXTENSION ID="com.intellij.execution.ExternalSystemRunConfigurationJavaExtension">
<extension name="net.ashald.envfile">
<option name="IS_ENABLED" value="false" />
<option name="IS_SUBST" value="false" />
<option name="IS_PATH_MACRO_SUPPORTED" value="false" />
<option name="IS_IGNORE_MISSING_FILES" value="false" />
<option name="IS_ENABLE_EXPERIMENTAL_INTEGRATIONS" value="false" />
<ENTRIES>
<ENTRY IS_ENABLED="true" PARSER="runconfig" IS_EXECUTABLE="false" />
</ENTRIES>
</extension>
</EXTENSION>
<DebugAllEnabled>false</DebugAllEnabled>
<RunAsTest>false</RunAsTest>
<method v="2" />
</configuration>
</component>

View File

@ -21,4 +21,7 @@ All libraries should be placed under <path>libraries</path>
<def title="io">
Abstraction over reading from and writing to resources (filesystem, HTTP, S3, etc.)
</def>
<def title="core-api">
Defines the core interfaces
</def>
</deflist>

View File

@ -1,7 +1,6 @@
package de.itkl.core_api.interfaces
import java.io.File
import java.io.InputStream
import java.nio.file.Path
interface FileProcessor {

View File

@ -3,20 +3,31 @@ package de.itkl.core_api.interfaces
import io.ktor.http.*
import org.koin.core.component.KoinComponent
import org.koin.core.component.get
import java.io.File
import java.io.InputStream
import java.nio.file.Path
abstract class Resource : KoinComponent {
abstract val filename: String
abstract val contentType: ContentType
abstract val length: Long?
interface Resource : KoinComponent {
val filename: String
val contentType: ContentType
val length: Long?
val file: File?
val path: Path?
fun read(): InputStream
}
protected abstract fun doRead(): InputStream
fun read(): InputStream {
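/**
 * Base [Resource] implementation: when [length] is known, [read] passes the
 * stream from [doRead] through the Koin-provided [ResourceReadDecorator];
 * otherwise it returns the undecorated stream.
 */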
abstract class AbstractResource : Resource, KoinComponent {
abstract fun doRead(): InputStream
final override fun read(): InputStream {
return length?.let { length ->
get<ResourceReadDecorator>().decorate(
length = length,
read()
doRead()
)
} ?: read()
} ?: doRead()
}
}

View File

@ -0,0 +1,24 @@
package de.itkl.fileprocessing
import de.itkl.core_api.interfaces.AbstractResource
import io.ktor.http.*
import java.io.File
import java.io.InputStream
import java.nio.file.Files
import java.nio.file.Path
import kotlin.io.path.name
/**
 * An [AbstractResource] backed by a file on the local filesystem.
 *
 * @property path location of the backing file.
 */
class FileResource(override val path: Path) : AbstractResource() {
    /** Creates the resource from a [File] by converting it to its [Path]. */
    constructor(file: File) : this(file.toPath())

    /** Length of the backing file as reported by [File.length]; computed on first access and then cached. */
    override val length: Long by lazy { path.toFile().length() }

    /** The backing file, derived from [path] on every access. */
    override val file: File?
        get() = path.toFile()

    /** Name of the last segment of [path]. */
    override val filename: String
        get() = path.name

    /**
     * Content type guessed from the file name's extension.
     *
     * `ContentType.fromFilePath` yields an empty list for a missing or unknown
     * extension, so fall back to `application/octet-stream` instead of throwing
     * [NoSuchElementException] from `first()`.
     */
    override val contentType: ContentType
        get() = ContentType.fromFilePath(path.name).firstOrNull() ?: ContentType.Application.OctetStream

    /** Opens a new [InputStream] on the backing file. */
    override fun doRead(): InputStream = Files.newInputStream(path)
}

View File

@ -1,5 +1,7 @@
package de.itkl.fileprocessing
import de.itkl.core_api.interfaces.Resource
interface ProgressBarFactory {
fun new(resource: Resource): ProgressBar
fun new(name: String, max: Long): ProgressBar

View File

@ -1,22 +1,14 @@
package de.itkl.fileprocessing
import de.itkl.core_api.interfaces.AbstractResource
import de.itkl.core_api.interfaces.Resource
import io.ktor.http.*
import java.io.File
import java.io.InputStream
import java.nio.file.Files
import java.nio.file.Path
import kotlin.io.path.name
interface Resource {
val path: Path
val size: Long
val filename: String
fun toFile(): File = path.toFile()
fun length() = path.toFile().length()
fun read(): InputStream
}
class ProgressResource(
private val resource: Resource,
private val progressBarFactory: ProgressBarFactory
@ -29,14 +21,3 @@ class ProgressResource(
)
}
}
class FileResource(override val path: Path) : Resource {
constructor(file: File): this(file.toPath())
override val size: Long by lazy { path.toFile().length() }
override val filename: String
get() = path.name
override fun read(): InputStream {
return Files.newInputStream(path)
}
}

View File

@ -1,4 +1,5 @@
dependencies {
api(project(":libraries:core-api"))
api("org.apache.lucene:lucene-analysis-common:9.9.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
implementation("com.google.guava:guava:32.1.3-jre")

View File

@ -2,6 +2,7 @@ package de.itkl.textprocessing
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.core_api.interfaces.Resource
import java.io.File
import java.nio.file.Path
@ -16,9 +17,9 @@ class HistogramCsvStorage {
}
}
}
suspend fun read(file: File): Histogram {
suspend fun read(resource: Resource): Histogram {
return csvReader { }
.openAsync(file) {
.openAsync(resource.read()) {
val sequence = readAllWithHeaderAsSequence()
Histogram.from(sequence)
}

View File

@ -1,6 +1,7 @@
dependencies {
api(project(":libraries:textprocessing"))
api(project(":libraries:fileprocessing"))
api(project(":libraries:core-api"))
implementation("com.github.ajalt.mordant:mordant:2.2.0")
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
implementation("com.google.guava:guava:32.1.3-jre")

View File

@ -1,7 +1,7 @@
package de.itkl.tfidf
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.Resource
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.Resource
import de.itkl.processing.parallelUnordered
import de.itkl.textprocessing.*
import de.itkl.textprocessing.interfaces.Stemmer
@ -24,8 +24,8 @@ class DocumentFrequency : FileProcessor, KoinComponent {
}
override suspend fun process(resource: Resource): File = coroutineScope {
Log.info { "Would produce: ${willProduce(resource.path)}" }
val resultFile = willProduce(resource.path).toFile()
Log.info { "Would produce: ${willProduce(resource.path!!)}" }
val resultFile = willProduce(resource.path!!).toFile()
val (numDocs, histogram) = TextFile(resource.read())
.splitByEmptyLines()
.withIndex()

View File

@ -1,43 +1,39 @@
package de.itkl.tfidf
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import de.itkl.fileprocessing.FileProcessor
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.core_api.interfaces.Resource
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.fileprocessing.Resource
import de.itkl.textprocessing.HistogramCsvStorage
import io.github.oshai.kotlinlogging.KotlinLogging
import org.koin.core.component.KoinComponent
import org.koin.core.component.inject
import java.io.File
import java.nio.file.Path
import kotlin.io.path.nameWithoutExtension
import kotlin.math.ln
import kotlin.math.log
import kotlin.math.log10
import kotlin.math.log2
private val Log = KotlinLogging.logger { }
class InverseDocumentFrequency : FileProcessor, KoinComponent {
/**
 * Computes the output path for [path]: a sibling file named after [path]
 * without its extension, suffixed with `-inverse-document-frequency.csv`.
 *
 * @param path the input file's path.
 * @return the sibling path of the CSV to be written.
 */
override fun willProduce(path: Path): Path {
    // resolveSibling equals parent.resolve(...) when a parent exists, but also
    // works for a bare relative file name, where path.parent is null.
    return path.resolveSibling(path.nameWithoutExtension + "-inverse-document-frequency.csv")
}
override suspend fun process(resource: Resource): File {
val histogram = HistogramCsvStorage().read(resource.toFile())
val histogram = HistogramCsvStorage().read(resource)
val numDocs = histogram
.find { (word, count) -> word == "\$numDocs" }!!
.find { (word, _) -> word == "\$numDocs" }!!
.second.toInt()
val progressBarFactory: ProgressBarFactory by inject()
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progess ->
csvWriter().openAsync(willProduce(resource.path).toFile(), append = false) {
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
csvWriter().openAsync(willProduce(resource.path!!).toFile(), append = false) {
writeRow("word", "idf")
histogram.forEach { (word, count) ->
writeRow(word, idf(numDocs, count))
progess.step()
progress.step()
}
}
resource.path.toFile()
resource.path!!.toFile()
}
}

View File

@ -3,9 +3,9 @@ package de.itkl.tfidf
import com.github.ajalt.mordant.animation.ProgressAnimation
import com.github.ajalt.mordant.animation.progressAnimation
import com.github.ajalt.mordant.terminal.Terminal
import de.itkl.core_api.interfaces.Resource
import de.itkl.fileprocessing.ProgressBar
import de.itkl.fileprocessing.ProgressBarFactory
import de.itkl.fileprocessing.Resource
class TerminalProgressBarFactory : ProgressBarFactory {
private val terminal = Terminal()
@ -17,7 +17,7 @@ class TerminalProgressBarFactory : ProgressBarFactory {
completed()
timeRemaining()
}
return TerminalProgressBar(animation, resource.length())
return TerminalProgressBar(animation, resource.length!!)
}
override fun new(name: String, max: Long): ProgressBar {

View File

@ -1,9 +1,7 @@
package de.itkl.tfidf
import de.itkl.core_api.interfaces.FileProcessor
import de.itkl.fileprocessing.FileProcessingPipeline
import de.itkl.fileprocessing.FileProcessor
import de.itkl.fileprocessing.ProgressBarFactory
import org.koin.core.component.KoinComponent
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
override val fileProcessor = listOf<FileProcessor>(