adding core api
parent
2deaa204c5
commit
cc727c681a
|
|
@ -0,0 +1,36 @@
|
|||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="docthor [clean]" type="GradleRunConfiguration" factoryName="Gradle" nameIsGenerated="true">
|
||||
<ExternalSystemSettings>
|
||||
<option name="executionName" />
|
||||
<option name="externalProjectPath" value="$PROJECT_DIR$" />
|
||||
<option name="externalSystemIdString" value="GRADLE" />
|
||||
<option name="scriptParameters" value="" />
|
||||
<option name="taskDescriptions">
|
||||
<list />
|
||||
</option>
|
||||
<option name="taskNames">
|
||||
<list>
|
||||
<option value="clean" />
|
||||
</list>
|
||||
</option>
|
||||
<option name="vmOptions" />
|
||||
</ExternalSystemSettings>
|
||||
<ExternalSystemDebugServerProcess>true</ExternalSystemDebugServerProcess>
|
||||
<ExternalSystemReattachDebugProcess>true</ExternalSystemReattachDebugProcess>
|
||||
<EXTENSION ID="com.intellij.execution.ExternalSystemRunConfigurationJavaExtension">
|
||||
<extension name="net.ashald.envfile">
|
||||
<option name="IS_ENABLED" value="false" />
|
||||
<option name="IS_SUBST" value="false" />
|
||||
<option name="IS_PATH_MACRO_SUPPORTED" value="false" />
|
||||
<option name="IS_IGNORE_MISSING_FILES" value="false" />
|
||||
<option name="IS_ENABLE_EXPERIMENTAL_INTEGRATIONS" value="false" />
|
||||
<ENTRIES>
|
||||
<ENTRY IS_ENABLED="true" PARSER="runconfig" IS_EXECUTABLE="false" />
|
||||
</ENTRIES>
|
||||
</extension>
|
||||
</EXTENSION>
|
||||
<DebugAllEnabled>false</DebugAllEnabled>
|
||||
<RunAsTest>false</RunAsTest>
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
||||
|
|
@ -21,4 +21,7 @@ All libraries should be placed under <path>libraries</path>
|
|||
<def title="io">
|
||||
Abstraction for reading from and writing to resources (filesystem, HTTP, S3, etc.)
|
||||
</def>
|
||||
<def title="core-api">
|
||||
Defines the core interfaces
|
||||
</def>
|
||||
</deflist>
|
||||
|
|
@ -6,6 +6,7 @@ import com.github.ajalt.clikt.parameters.options.option
|
|||
import com.github.ajalt.clikt.parameters.options.required
|
||||
import com.github.ajalt.clikt.parameters.types.enum
|
||||
import com.github.ajalt.clikt.parameters.types.file
|
||||
import de.itkl.core_api.coreApiModule
|
||||
import de.itkl.fileprocessing.ProgressBarFactory
|
||||
import de.itkl.textprocessing.textProcessingModule
|
||||
import de.itkl.tfidf.Language
|
||||
|
|
@ -33,6 +34,7 @@ class ComputeIdf : CliktCommand() {
|
|||
fun main(args: Array<String>) {
|
||||
startKoin {
|
||||
modules(
|
||||
coreApiModule,
|
||||
textProcessingModule,
|
||||
module {
|
||||
single<ProgressBarFactory> {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,9 @@
|
|||
package de.itkl.core_api
|
||||
|
||||
import de.itkl.core_api.interfaces.NoopResourceReadDecorator
|
||||
import de.itkl.core_api.interfaces.ResourceReadDecorator
|
||||
import org.koin.dsl.module
|
||||
|
||||
/**
 * Koin module of the core API.
 *
 * Registers a single-instance binding of [ResourceReadDecorator] that is
 * backed by [NoopResourceReadDecorator].
 */
val coreApiModule = module {
    single<ResourceReadDecorator> {
        NoopResourceReadDecorator()
    }
}
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
package de.itkl.core_api.interfaces
|
||||
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.nio.file.Path
|
||||
|
||||
interface FileProcessor {
|
||||
|
|
|
|||
|
|
@ -3,20 +3,32 @@ package de.itkl.core_api.interfaces
|
|||
import io.ktor.http.*
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.get
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.nio.file.Path
|
||||
|
||||
abstract class Resource : KoinComponent {
|
||||
abstract val filename: String
|
||||
abstract val contentType: ContentType
|
||||
abstract val length: Long?
|
||||
interface Resource : KoinComponent {
|
||||
val filename: String
|
||||
val contentType: ContentType
|
||||
// TODO: Find a better method to avoid those nulls. Maybe subtyping the interface
|
||||
val length: Long?
|
||||
val file: File?
|
||||
val path: Path?
|
||||
fun read(): InputStream
|
||||
}
|
||||
|
||||
protected abstract fun doRead(): InputStream
|
||||
fun read(): InputStream {
|
||||
/**
|
||||
* Automatically adds koin injectable decorators to reading/writing
|
||||
* operations
|
||||
*/
|
||||
abstract class AbstractResource : Resource, KoinComponent {
|
||||
abstract fun doRead(): InputStream
|
||||
final override fun read(): InputStream {
|
||||
return length?.let { length ->
|
||||
get<ResourceReadDecorator>().decorate(
|
||||
length = length,
|
||||
read()
|
||||
doRead()
|
||||
)
|
||||
} ?: read()
|
||||
} ?: doRead()
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
package de.itkl.fileprocessing
|
||||
|
||||
import de.itkl.core_api.interfaces.AbstractResource
|
||||
import io.ktor.http.*
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.name
|
||||
|
||||
/**
 * An [AbstractResource] backed by a file addressed by [path].
 *
 * @property path location of the underlying file
 */
class FileResource(override val path: Path) : AbstractResource() {
    /** Creates the resource from a [File] by converting it to its [Path]. */
    constructor(file: File) : this(file.toPath())

    /** Value of [File.length] for [path]; computed on first access and cached afterwards. */
    override val length: Long by lazy { path.toFile().length() }

    /** The [File] view of [path]. */
    override val file: File?
        get() = path.toFile()

    /** Opens a new [InputStream] on [path] for every call; the stream is not closed here. */
    override fun doRead(): InputStream = Files.newInputStream(path)

    /** Name component of [path]. */
    override val filename: String
        get() = path.name

    /**
     * Content type looked up from the file name.
     *
     * `ContentType.fromFilePath` yields an empty list when the name has no
     * extension or the extension is unknown, so fall back to
     * [ContentType.Application.OctetStream] instead of failing with
     * [NoSuchElementException].
     */
    override val contentType: ContentType
        get() = ContentType.fromFilePath(path.name).firstOrNull()
            ?: ContentType.Application.OctetStream
}
|
||||
|
|
@ -1,5 +1,7 @@
|
|||
package de.itkl.fileprocessing
|
||||
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
|
||||
interface ProgressBarFactory {
|
||||
fun new(resource: Resource): ProgressBar
|
||||
fun new(name: String, max: Long): ProgressBar
|
||||
|
|
|
|||
|
|
@ -1,22 +1,14 @@
|
|||
package de.itkl.fileprocessing
|
||||
|
||||
import de.itkl.core_api.interfaces.AbstractResource
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import io.ktor.http.*
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.name
|
||||
|
||||
/**
 * A readable resource that is addressed by a [Path].
 */
interface Resource {
    /** Location of the resource. */
    val path: Path

    /** Size of the resource. NOTE(review): presumably in bytes — confirm against implementations. */
    val size: Long

    /** Name under which the resource is known. */
    val filename: String

    /** Returns [path] converted to a [File]. */
    fun toFile(): File {
        return path.toFile()
    }

    /** Returns the value of [File.length] for [path]. */
    fun length(): Long {
        return path.toFile().length()
    }

    /** Returns an [InputStream] over the content of the resource. */
    fun read(): InputStream
}
|
||||
|
||||
class ProgressResource(
|
||||
private val resource: Resource,
|
||||
private val progressBarFactory: ProgressBarFactory
|
||||
|
|
@ -29,14 +21,3 @@ class ProgressResource(
|
|||
)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * [Resource] implementation for a file addressed by [path].
 *
 * @property path location of the underlying file
 */
class FileResource(override val path: Path) : Resource {
    /** Creates the resource from a [File] by converting it to its [Path]. */
    constructor(file: File) : this(file.toPath())

    /** Value of [File.length] for [path]; computed on first access and cached afterwards. */
    override val size: Long by lazy {
        val backingFile = path.toFile()
        backingFile.length()
    }

    /** Name component of [path]. */
    override val filename: String
        get() {
            return path.name
        }

    /** Opens a new [InputStream] on [path] for every call. */
    override fun read(): InputStream = Files.newInputStream(path)
}
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
dependencies {
|
||||
api(project(":libraries:core-api"))
|
||||
api("org.apache.lucene:lucene-analysis-common:9.9.0")
|
||||
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
||||
implementation("com.google.guava:guava:32.1.3-jre")
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package de.itkl.textprocessing
|
|||
|
||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
|
||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import java.io.File
|
||||
import java.nio.file.Path
|
||||
|
||||
|
|
@ -16,9 +17,9 @@ class HistogramCsvStorage {
|
|||
}
|
||||
}
|
||||
}
|
||||
suspend fun read(file: File): Histogram {
|
||||
suspend fun read(resource: Resource): Histogram {
|
||||
return csvReader { }
|
||||
.openAsync(file) {
|
||||
.openAsync(resource.read()) {
|
||||
val sequence = readAllWithHeaderAsSequence()
|
||||
Histogram.from(sequence)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
dependencies {
|
||||
api(project(":libraries:textprocessing"))
|
||||
api(project(":libraries:fileprocessing"))
|
||||
api(project(":libraries:core-api"))
|
||||
implementation("com.github.ajalt.mordant:mordant:2.2.0")
|
||||
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
||||
implementation("com.google.guava:guava:32.1.3-jre")
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
package de.itkl.tfidf
|
||||
|
||||
import de.itkl.fileprocessing.FileProcessor
|
||||
import de.itkl.fileprocessing.Resource
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.processing.parallelUnordered
|
||||
import de.itkl.textprocessing.*
|
||||
import de.itkl.textprocessing.interfaces.Stemmer
|
||||
|
|
@ -24,8 +24,8 @@ class DocumentFrequency : FileProcessor, KoinComponent {
|
|||
}
|
||||
|
||||
override suspend fun process(resource: Resource): File = coroutineScope {
|
||||
Log.info { "Would produce: ${willProduce(resource.path)}" }
|
||||
val resultFile = willProduce(resource.path).toFile()
|
||||
Log.info { "Would produce: ${willProduce(resource.path!!)}" }
|
||||
val resultFile = willProduce(resource.path!!).toFile()
|
||||
val (numDocs, histogram) = TextFile(resource.read())
|
||||
.splitByEmptyLines()
|
||||
.withIndex()
|
||||
|
|
|
|||
|
|
@ -1,43 +1,39 @@
|
|||
package de.itkl.tfidf
|
||||
|
||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
||||
import de.itkl.fileprocessing.FileProcessor
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.fileprocessing.ProgressBarFactory
|
||||
import de.itkl.fileprocessing.Resource
|
||||
import de.itkl.textprocessing.HistogramCsvStorage
|
||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
||||
import org.koin.core.component.KoinComponent
|
||||
import org.koin.core.component.inject
|
||||
import java.io.File
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.nameWithoutExtension
|
||||
import kotlin.math.ln
|
||||
import kotlin.math.log
|
||||
import kotlin.math.log10
|
||||
import kotlin.math.log2
|
||||
|
||||
private val Log = KotlinLogging.logger { }
|
||||
|
||||
class InverseDocumentFrequency : FileProcessor, KoinComponent {
|
||||
override fun willProduce(path: Path): Path {
|
||||
return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
|
||||
}
|
||||
|
||||
|
||||
override suspend fun process(resource: Resource): File {
|
||||
val histogram = HistogramCsvStorage().read(resource.toFile())
|
||||
val histogram = HistogramCsvStorage().read(resource)
|
||||
val numDocs = histogram
|
||||
.find { (word, count) -> word == "\$numDocs" }!!
|
||||
.find { (word, _) -> word == "\$numDocs" }!!
|
||||
.second.toInt()
|
||||
val progressBarFactory: ProgressBarFactory by inject()
|
||||
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progess ->
|
||||
csvWriter().openAsync(willProduce(resource.path).toFile(), append = false) {
|
||||
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
|
||||
csvWriter().openAsync(willProduce(resource.path!!).toFile(), append = false) {
|
||||
writeRow("word", "idf")
|
||||
histogram.forEach { (word, count) ->
|
||||
writeRow(word, idf(numDocs, count))
|
||||
progess.step()
|
||||
progress.step()
|
||||
}
|
||||
}
|
||||
resource.path.toFile()
|
||||
resource.path!!.toFile()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3,9 +3,9 @@ package de.itkl.tfidf
|
|||
import com.github.ajalt.mordant.animation.ProgressAnimation
|
||||
import com.github.ajalt.mordant.animation.progressAnimation
|
||||
import com.github.ajalt.mordant.terminal.Terminal
|
||||
import de.itkl.core_api.interfaces.Resource
|
||||
import de.itkl.fileprocessing.ProgressBar
|
||||
import de.itkl.fileprocessing.ProgressBarFactory
|
||||
import de.itkl.fileprocessing.Resource
|
||||
|
||||
class TerminalProgressBarFactory : ProgressBarFactory {
|
||||
private val terminal = Terminal()
|
||||
|
|
@ -17,7 +17,7 @@ class TerminalProgressBarFactory : ProgressBarFactory {
|
|||
completed()
|
||||
timeRemaining()
|
||||
}
|
||||
return TerminalProgressBar(animation, resource.length())
|
||||
return TerminalProgressBar(animation, resource.length!!)
|
||||
}
|
||||
|
||||
override fun new(name: String, max: Long): ProgressBar {
|
||||
|
|
|
|||
|
|
@ -1,9 +1,7 @@
|
|||
package de.itkl.tfidf
|
||||
|
||||
import de.itkl.core_api.interfaces.FileProcessor
|
||||
import de.itkl.fileprocessing.FileProcessingPipeline
|
||||
import de.itkl.fileprocessing.FileProcessor
|
||||
import de.itkl.fileprocessing.ProgressBarFactory
|
||||
import org.koin.core.component.KoinComponent
|
||||
|
||||
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
|
||||
override val fileProcessor = listOf<FileProcessor>(
|
||||
|
|
|
|||
Loading…
Reference in New Issue