adding core api
parent
2deaa204c5
commit
6971e00221
|
|
@ -0,0 +1,36 @@
|
||||||
|
<component name="ProjectRunConfigurationManager">
|
||||||
|
<configuration default="false" name="docthor [clean]" type="GradleRunConfiguration" factoryName="Gradle" nameIsGenerated="true">
|
||||||
|
<ExternalSystemSettings>
|
||||||
|
<option name="executionName" />
|
||||||
|
<option name="externalProjectPath" value="$PROJECT_DIR$" />
|
||||||
|
<option name="externalSystemIdString" value="GRADLE" />
|
||||||
|
<option name="scriptParameters" value="" />
|
||||||
|
<option name="taskDescriptions">
|
||||||
|
<list />
|
||||||
|
</option>
|
||||||
|
<option name="taskNames">
|
||||||
|
<list>
|
||||||
|
<option value="clean" />
|
||||||
|
</list>
|
||||||
|
</option>
|
||||||
|
<option name="vmOptions" />
|
||||||
|
</ExternalSystemSettings>
|
||||||
|
<ExternalSystemDebugServerProcess>true</ExternalSystemDebugServerProcess>
|
||||||
|
<ExternalSystemReattachDebugProcess>true</ExternalSystemReattachDebugProcess>
|
||||||
|
<EXTENSION ID="com.intellij.execution.ExternalSystemRunConfigurationJavaExtension">
|
||||||
|
<extension name="net.ashald.envfile">
|
||||||
|
<option name="IS_ENABLED" value="false" />
|
||||||
|
<option name="IS_SUBST" value="false" />
|
||||||
|
<option name="IS_PATH_MACRO_SUPPORTED" value="false" />
|
||||||
|
<option name="IS_IGNORE_MISSING_FILES" value="false" />
|
||||||
|
<option name="IS_ENABLE_EXPERIMENTAL_INTEGRATIONS" value="false" />
|
||||||
|
<ENTRIES>
|
||||||
|
<ENTRY IS_ENABLED="true" PARSER="runconfig" IS_EXECUTABLE="false" />
|
||||||
|
</ENTRIES>
|
||||||
|
</extension>
|
||||||
|
</EXTENSION>
|
||||||
|
<DebugAllEnabled>false</DebugAllEnabled>
|
||||||
|
<RunAsTest>false</RunAsTest>
|
||||||
|
<method v="2" />
|
||||||
|
</configuration>
|
||||||
|
</component>
|
||||||
|
|
@ -21,4 +21,7 @@ All libraries should be placed under <path>libraries</path>
|
||||||
<def title="io">
|
<def title="io">
|
||||||
Abstraction for reading from and writing to resources (file system, HTTP, S3, etc.)
|
Abstraction for reading from and writing to resources (file system, HTTP, S3, etc.)
|
||||||
</def>
|
</def>
|
||||||
|
<def title="core-api">
|
||||||
|
Defines the core interfaces
|
||||||
|
</def>
|
||||||
</deflist>
|
</deflist>
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
package de.itkl.core_api.interfaces
|
package de.itkl.core_api.interfaces
|
||||||
|
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import java.io.InputStream
|
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
|
|
||||||
interface FileProcessor {
|
interface FileProcessor {
|
||||||
|
|
|
||||||
|
|
@ -3,20 +3,31 @@ package de.itkl.core_api.interfaces
|
||||||
import io.ktor.http.*
|
import io.ktor.http.*
|
||||||
import org.koin.core.component.KoinComponent
|
import org.koin.core.component.KoinComponent
|
||||||
import org.koin.core.component.get
|
import org.koin.core.component.get
|
||||||
|
import java.io.File
|
||||||
import java.io.InputStream
|
import java.io.InputStream
|
||||||
|
import java.nio.file.Path
|
||||||
|
|
||||||
abstract class Resource : KoinComponent {
|
interface Resource : KoinComponent {
|
||||||
abstract val filename: String
|
val filename: String
|
||||||
abstract val contentType: ContentType
|
val contentType: ContentType
|
||||||
abstract val length: Long?
|
val length: Long?
|
||||||
|
val file: File?
|
||||||
|
val path: Path?
|
||||||
|
fun read(): InputStream
|
||||||
|
}
|
||||||
|
|
||||||
protected abstract fun doRead(): InputStream
|
/**
|
||||||
fun read(): InputStream {
|
* Automatically applies Koin-injectable decorators to reading/writing
|
||||||
|
* operations (in this class only [read] is decorated, via [ResourceReadDecorator])
|
||||||
|
*/
|
||||||
|
abstract class AbstractResource : Resource, KoinComponent {
|
||||||
|
abstract fun doRead(): InputStream
|
||||||
|
final override fun read(): InputStream {
|
||||||
return length?.let { length ->
|
return length?.let { length ->
|
||||||
get<ResourceReadDecorator>().decorate(
|
get<ResourceReadDecorator>().decorate(
|
||||||
length = length,
|
length = length,
|
||||||
read()
|
doRead()
|
||||||
)
|
)
|
||||||
} ?: read()
|
} ?: doRead()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1,24 @@
|
||||||
|
package de.itkl.fileprocessing
|
||||||
|
|
||||||
|
import de.itkl.core_api.interfaces.AbstractResource
|
||||||
|
import io.ktor.http.*
|
||||||
|
import java.io.File
|
||||||
|
import java.io.InputStream
|
||||||
|
import java.nio.file.Files
|
||||||
|
import java.nio.file.Path
|
||||||
|
import kotlin.io.path.name
|
||||||
|
|
||||||
|
/**
 * [AbstractResource] backed by a file addressed through a [Path].
 *
 * @property path location of the backing file; [filename], [contentType],
 * [length] and [file] are all derived from it.
 */
class FileResource(override val path: Path) : AbstractResource() {
    /** Convenience constructor for callers that hold a [File] instead of a [Path]. */
    constructor(file: File) : this(file.toPath())

    /**
     * Size of the backing file in bytes as reported by [File.length].
     * Evaluated on first access and cached afterwards (`by lazy`).
     */
    override val length: Long by lazy { path.toFile().length() }

    /** The backing file as a [File], converted from [path] on every access. */
    override val file: File?
        get() = path.toFile()

    /** Last name element of [path]. */
    override val filename: String
        get() = path.name

    /**
     * Content type looked up from the file name.
     *
     * `ContentType.fromFilePath` returns an empty list when the name has no
     * extension or the extension is not registered, so calling `first()` on it
     * would throw [NoSuchElementException]. Such files are reported as
     * `application/octet-stream` instead.
     */
    override val contentType: ContentType
        get() = ContentType.fromFilePath(path.name).firstOrNull()
            ?: ContentType.Application.OctetStream

    /** Opens a new [InputStream] on [path]; the stream is not closed by this class. */
    override fun doRead(): InputStream = Files.newInputStream(path)
}
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
package de.itkl.fileprocessing
|
package de.itkl.fileprocessing
|
||||||
|
|
||||||
|
import de.itkl.core_api.interfaces.Resource
|
||||||
|
|
||||||
interface ProgressBarFactory {
|
interface ProgressBarFactory {
|
||||||
fun new(resource: Resource): ProgressBar
|
fun new(resource: Resource): ProgressBar
|
||||||
fun new(name: String, max: Long): ProgressBar
|
fun new(name: String, max: Long): ProgressBar
|
||||||
|
|
|
||||||
|
|
@ -1,22 +1,14 @@
|
||||||
package de.itkl.fileprocessing
|
package de.itkl.fileprocessing
|
||||||
|
|
||||||
|
import de.itkl.core_api.interfaces.AbstractResource
|
||||||
|
import de.itkl.core_api.interfaces.Resource
|
||||||
|
import io.ktor.http.*
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import java.io.InputStream
|
import java.io.InputStream
|
||||||
import java.nio.file.Files
|
import java.nio.file.Files
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
import kotlin.io.path.name
|
import kotlin.io.path.name
|
||||||
|
|
||||||
interface Resource {
|
|
||||||
val path: Path
|
|
||||||
val size: Long
|
|
||||||
val filename: String
|
|
||||||
fun toFile(): File = path.toFile()
|
|
||||||
|
|
||||||
fun length() = path.toFile().length()
|
|
||||||
|
|
||||||
fun read(): InputStream
|
|
||||||
}
|
|
||||||
|
|
||||||
class ProgressResource(
|
class ProgressResource(
|
||||||
private val resource: Resource,
|
private val resource: Resource,
|
||||||
private val progressBarFactory: ProgressBarFactory
|
private val progressBarFactory: ProgressBarFactory
|
||||||
|
|
@ -29,14 +21,3 @@ class ProgressResource(
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class FileResource(override val path: Path) : Resource {
|
|
||||||
constructor(file: File): this(file.toPath())
|
|
||||||
override val size: Long by lazy { path.toFile().length() }
|
|
||||||
override val filename: String
|
|
||||||
get() = path.name
|
|
||||||
|
|
||||||
override fun read(): InputStream {
|
|
||||||
return Files.newInputStream(path)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
dependencies {
|
dependencies {
|
||||||
|
api(project(":libraries:core-api"))
|
||||||
api("org.apache.lucene:lucene-analysis-common:9.9.0")
|
api("org.apache.lucene:lucene-analysis-common:9.9.0")
|
||||||
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
||||||
implementation("com.google.guava:guava:32.1.3-jre")
|
implementation("com.google.guava:guava:32.1.3-jre")
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package de.itkl.textprocessing
|
||||||
|
|
||||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
|
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
|
||||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
||||||
|
import de.itkl.core_api.interfaces.Resource
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
|
|
||||||
|
|
@ -16,9 +17,9 @@ class HistogramCsvStorage {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
suspend fun read(file: File): Histogram {
|
suspend fun read(resource: Resource): Histogram {
|
||||||
return csvReader { }
|
return csvReader { }
|
||||||
.openAsync(file) {
|
.openAsync(resource.read()) {
|
||||||
val sequence = readAllWithHeaderAsSequence()
|
val sequence = readAllWithHeaderAsSequence()
|
||||||
Histogram.from(sequence)
|
Histogram.from(sequence)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
dependencies {
|
dependencies {
|
||||||
api(project(":libraries:textprocessing"))
|
api(project(":libraries:textprocessing"))
|
||||||
api(project(":libraries:fileprocessing"))
|
api(project(":libraries:fileprocessing"))
|
||||||
|
api(project(":libraries:core-api"))
|
||||||
implementation("com.github.ajalt.mordant:mordant:2.2.0")
|
implementation("com.github.ajalt.mordant:mordant:2.2.0")
|
||||||
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.2")
|
||||||
implementation("com.google.guava:guava:32.1.3-jre")
|
implementation("com.google.guava:guava:32.1.3-jre")
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
package de.itkl.tfidf
|
package de.itkl.tfidf
|
||||||
|
|
||||||
import de.itkl.fileprocessing.FileProcessor
|
import de.itkl.core_api.interfaces.FileProcessor
|
||||||
import de.itkl.fileprocessing.Resource
|
import de.itkl.core_api.interfaces.Resource
|
||||||
import de.itkl.processing.parallelUnordered
|
import de.itkl.processing.parallelUnordered
|
||||||
import de.itkl.textprocessing.*
|
import de.itkl.textprocessing.*
|
||||||
import de.itkl.textprocessing.interfaces.Stemmer
|
import de.itkl.textprocessing.interfaces.Stemmer
|
||||||
|
|
@ -24,8 +24,8 @@ class DocumentFrequency : FileProcessor, KoinComponent {
|
||||||
}
|
}
|
||||||
|
|
||||||
override suspend fun process(resource: Resource): File = coroutineScope {
|
override suspend fun process(resource: Resource): File = coroutineScope {
|
||||||
Log.info { "Would produce: ${willProduce(resource.path)}" }
|
Log.info { "Would produce: ${willProduce(resource.path!!)}" }
|
||||||
val resultFile = willProduce(resource.path).toFile()
|
val resultFile = willProduce(resource.path!!).toFile()
|
||||||
val (numDocs, histogram) = TextFile(resource.read())
|
val (numDocs, histogram) = TextFile(resource.read())
|
||||||
.splitByEmptyLines()
|
.splitByEmptyLines()
|
||||||
.withIndex()
|
.withIndex()
|
||||||
|
|
|
||||||
|
|
@ -1,43 +1,39 @@
|
||||||
package de.itkl.tfidf
|
package de.itkl.tfidf
|
||||||
|
|
||||||
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
|
||||||
import de.itkl.fileprocessing.FileProcessor
|
import de.itkl.core_api.interfaces.FileProcessor
|
||||||
|
import de.itkl.core_api.interfaces.Resource
|
||||||
import de.itkl.fileprocessing.ProgressBarFactory
|
import de.itkl.fileprocessing.ProgressBarFactory
|
||||||
import de.itkl.fileprocessing.Resource
|
|
||||||
import de.itkl.textprocessing.HistogramCsvStorage
|
import de.itkl.textprocessing.HistogramCsvStorage
|
||||||
import io.github.oshai.kotlinlogging.KotlinLogging
|
|
||||||
import org.koin.core.component.KoinComponent
|
import org.koin.core.component.KoinComponent
|
||||||
import org.koin.core.component.inject
|
import org.koin.core.component.inject
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
import kotlin.io.path.nameWithoutExtension
|
import kotlin.io.path.nameWithoutExtension
|
||||||
import kotlin.math.ln
|
|
||||||
import kotlin.math.log
|
|
||||||
import kotlin.math.log10
|
import kotlin.math.log10
|
||||||
import kotlin.math.log2
|
|
||||||
|
|
||||||
private val Log = KotlinLogging.logger { }
|
|
||||||
|
|
||||||
class InverseDocumentFrequency : FileProcessor, KoinComponent {
|
class InverseDocumentFrequency : FileProcessor, KoinComponent {
|
||||||
override fun willProduce(path: Path): Path {
|
override fun willProduce(path: Path): Path {
|
||||||
return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
|
return path.parent.resolve(path.nameWithoutExtension + "-inverse-document-frequency.csv")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
override suspend fun process(resource: Resource): File {
|
override suspend fun process(resource: Resource): File {
|
||||||
val histogram = HistogramCsvStorage().read(resource.toFile())
|
val histogram = HistogramCsvStorage().read(resource)
|
||||||
val numDocs = histogram
|
val numDocs = histogram
|
||||||
.find { (word, count) -> word == "\$numDocs" }!!
|
.find { (word, _) -> word == "\$numDocs" }!!
|
||||||
.second.toInt()
|
.second.toInt()
|
||||||
val progressBarFactory: ProgressBarFactory by inject()
|
val progressBarFactory: ProgressBarFactory by inject()
|
||||||
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progess ->
|
return progressBarFactory.new("compute idf", histogram.size.toLong()).use { progress ->
|
||||||
csvWriter().openAsync(willProduce(resource.path).toFile(), append = false) {
|
csvWriter().openAsync(willProduce(resource.path!!).toFile(), append = false) {
|
||||||
writeRow("word", "idf")
|
writeRow("word", "idf")
|
||||||
histogram.forEach { (word, count) ->
|
histogram.forEach { (word, count) ->
|
||||||
writeRow(word, idf(numDocs, count))
|
writeRow(word, idf(numDocs, count))
|
||||||
progess.step()
|
progress.step()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
resource.path.toFile()
|
resource.path!!.toFile()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,9 +3,9 @@ package de.itkl.tfidf
|
||||||
import com.github.ajalt.mordant.animation.ProgressAnimation
|
import com.github.ajalt.mordant.animation.ProgressAnimation
|
||||||
import com.github.ajalt.mordant.animation.progressAnimation
|
import com.github.ajalt.mordant.animation.progressAnimation
|
||||||
import com.github.ajalt.mordant.terminal.Terminal
|
import com.github.ajalt.mordant.terminal.Terminal
|
||||||
|
import de.itkl.core_api.interfaces.Resource
|
||||||
import de.itkl.fileprocessing.ProgressBar
|
import de.itkl.fileprocessing.ProgressBar
|
||||||
import de.itkl.fileprocessing.ProgressBarFactory
|
import de.itkl.fileprocessing.ProgressBarFactory
|
||||||
import de.itkl.fileprocessing.Resource
|
|
||||||
|
|
||||||
class TerminalProgressBarFactory : ProgressBarFactory {
|
class TerminalProgressBarFactory : ProgressBarFactory {
|
||||||
private val terminal = Terminal()
|
private val terminal = Terminal()
|
||||||
|
|
@ -17,7 +17,7 @@ class TerminalProgressBarFactory : ProgressBarFactory {
|
||||||
completed()
|
completed()
|
||||||
timeRemaining()
|
timeRemaining()
|
||||||
}
|
}
|
||||||
return TerminalProgressBar(animation, resource.length())
|
return TerminalProgressBar(animation, resource.length!!)
|
||||||
}
|
}
|
||||||
|
|
||||||
override fun new(name: String, max: Long): ProgressBar {
|
override fun new(name: String, max: Long): ProgressBar {
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,7 @@
|
||||||
package de.itkl.tfidf
|
package de.itkl.tfidf
|
||||||
|
|
||||||
|
import de.itkl.core_api.interfaces.FileProcessor
|
||||||
import de.itkl.fileprocessing.FileProcessingPipeline
|
import de.itkl.fileprocessing.FileProcessingPipeline
|
||||||
import de.itkl.fileprocessing.FileProcessor
|
|
||||||
import de.itkl.fileprocessing.ProgressBarFactory
|
|
||||||
import org.koin.core.component.KoinComponent
|
|
||||||
|
|
||||||
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
|
class TfIdfPipeline(force: Boolean) : FileProcessingPipeline(force) {
|
||||||
override val fileProcessor = listOf<FileProcessor>(
|
override val fileProcessor = listOf<FileProcessor>(
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue