metadata-extractor: init

This commit is contained in:
Harsh Shandilya 2022-08-05 22:32:22 +05:30
parent 3862b35f7b
commit 319838c737
No known key found for this signature in database
GPG key ID: 366D7BBAD1031E80
5 changed files with 71 additions and 1 deletions

View file

@ -0,0 +1,14 @@
// Gradle build script (Kotlin DSL) for a JVM library module; judging by its dependencies it
// supports fetching a web page and extracting metadata from it.
// NOTE(review): the two suppressions below are presumably needed for the `libs` accessors used
// in this script — confirm they are still required.
@file:Suppress("DSL_SCOPE_VIOLATION", "UnstableApiUsage")
// Applies the Kotlin JVM plugin together with the project's own `kotlin-library` plugin.
plugins {
kotlin("jvm")
id("dev.msfjarvis.claw.kotlin-library")
}
// All libraries are declared as `implementation` dependencies and are referenced through the
// `libs` accessor (presumably a Gradle version catalog — TODO confirm).
dependencies {
implementation(libs.crux)
implementation(libs.dagger.hilt.core)
implementation(libs.jsoup)
implementation(libs.kotlinx.coroutines.core)
implementation(libs.okhttp.core)
}

View file

@ -0,0 +1,7 @@
package dev.msfjarvis.claw.metadata
/**
 * Metadata describing a single link.
 *
 * @property url the link the metadata belongs to.
 * @property faviconUrl address of the page's favicon, or `null` when none is known.
 * @property readingTime reading time reported for the page as a string, or `null` when none is
 *   known. NOTE(review): presumably a duration in milliseconds — confirm against the producer.
 */
data class LinkMetadata(
  val url: String,
  val faviconUrl: String?,
  val readingTime: String?,
)

View file

@ -0,0 +1,43 @@
package dev.msfjarvis.claw.metadata
import com.chimbori.crux.Crux
import com.chimbori.crux.api.Fields.DURATION_MS
import com.chimbori.crux.api.Fields.FAVICON_URL
import javax.inject.Inject
import okhttp3.HttpUrl.Companion.toHttpUrlOrNull
import okhttp3.OkHttpClient
import okhttp3.Request
import org.jsoup.Jsoup
/**
 * Downloads a web page and extracts [LinkMetadata] from it.
 *
 * @param crux extractor that is run against the parsed HTML document.
 * @param okHttpClient HTTP client used to download the page.
 */
class MetadataExtractor
@Inject
constructor(
  private val crux: Crux,
  private val okHttpClient: OkHttpClient,
) {

  /**
   * Fetches [url], parses the response body as HTML and returns the metadata extracted from it.
   *
   * A [LinkMetadata] carrying only [url] (all other fields `null`) is returned when [url] cannot
   * be parsed as an HTTP(S) URL or when the response has no body. Errors raised while executing
   * the request are not caught here and propagate to the caller.
   *
   * @param url the address of the page to inspect.
   * @return the extracted metadata; `faviconUrl` and `readingTime` are `null` when the page does
   *   not provide them.
   */
  suspend fun getExtractedMetadata(url: String): LinkMetadata {
    val parsedUrl = url.toHttpUrlOrNull() ?: return makeDefault(url)
    val request = Request.Builder().url(parsedUrl).build()
    // NOTE(review): execute() blocks the calling thread; callers presumably need to invoke this
    // function from a dispatcher suited to blocking I/O — confirm.
    val htmlContent =
      okHttpClient.newCall(request).execute().use { response ->
        // `use` closes the response on every exit path, including this early return.
        val body = response.body ?: return makeDefault(url)
        body.string()
      }
    val extractedMetadata = crux.extractFrom(parsedUrl, Jsoup.parse(htmlContent, url))
    // Safe call is required: toString() on a null entry yields the literal string "null", which
    // would be reported as a favicon URL instead of "no favicon".
    val faviconUrl = extractedMetadata.urls[FAVICON_URL]?.toString()
    val readingTime = extractedMetadata[DURATION_MS]
    return LinkMetadata(
      url = url,
      faviconUrl = faviconUrl,
      readingTime = readingTime,
    )
  }

  /** Builds the fallback result that carries only [url] and no extracted fields. */
  private fun makeDefault(url: String) =
    LinkMetadata(
      url = url,
      faviconUrl = null,
      readingTime = null,
    )
}