Compare commits


2 Commits

Author SHA1 Message Date
Harsh Shandilya d7c0e8574e refactor: match on domains rather than regex 2024-05-05 22:15:43 +05:30
Harsh Shandilya f696f25913 refactor: use typed URLs 2024-05-05 22:06:39 +05:30
7 changed files with 55 additions and 56 deletions

View File

@@ -5,7 +5,6 @@ use crate::{
utils::{get_urls_from_message, AsyncError},
};
use model::AMPResponse;
use reqwest::Url;
use std::str::FromStr;
use teloxide::{prelude::Requester, types::Message, utils::html::link, Bot};
use tracing::debug;
@@ -31,7 +30,7 @@ pub async fn handler(bot: Bot, message: Message) -> Result<(), AsyncError> {
debug!(?resp, "{url}");
let resp = deserialize_amp_response(&resp)?;
if let AMPResponse::Success(ok) = resp {
text = text.replace(url, &ok[0].canonical.url);
text = text.replace(url.as_str(), &ok[0].canonical.url);
} else {
return Ok(());
}
@@ -50,7 +49,7 @@ pub fn is_amp(msg: Message) -> bool {
if urls.is_empty() {
return false;
}
urls.iter().flat_map(|url| Url::parse(url)).any(|url| {
urls.iter().any(|url| {
if let Some(mut segments) = url.path_segments()
&& let Some(host) = url.host_str()
{

View File

@@ -8,7 +8,8 @@ use teloxide::{prelude::Requester, types::Message, utils::html::link, Bot};
const HOST_MATCH_GROUP: &str = "host";
pub static MATCH_REGEX: Lazy<Regex> = Lazy::new(|| {
pub const DOMAINS: [&str; 1] = ["instagram.com"];
static MATCH_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new("https://(?:www.)?(?P<host>instagram.com)/(p|reel|tv)/[A-Za-z0-9]+.*/").unwrap()
});
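
Each fixer module now pairs a public DOMAINS list, consumed by the cheaper pre-filter in main.rs, with a module-private MATCH_REGEX that the handler itself still uses. A minimal sketch of that module shape, assuming the same once_cell/regex setup as above; the pattern body is abbreviated here for illustration:

use once_cell::sync::Lazy;
use regex::Regex;

// Public: main.rs only needs the domain list for its pre-filter.
pub const DOMAINS: [&str; 1] = ["instagram.com"];

// Private: only the handler in this module needs the full pattern now.
static MATCH_REGEX: Lazy<Regex> =
    Lazy::new(|| Regex::new("https://(?:www.)?instagram.com/(p|reel|tv)/.*").unwrap());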

View File

@@ -28,6 +28,7 @@ use teloxide::{
update_listeners::Polling,
Bot,
};
use utils::has_matching_urls;
pub(crate) static FIXER_STATE: Lazy<Mutex<HashMap<ChatId, FixerState>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
@@ -50,18 +51,13 @@ async fn run() {
)
.branch(
dptree::filter(|msg: Message| {
if let Ok(ref mut map) = FIXER_STATE.try_lock()
let should_match = has_matching_urls(&msg, &twitter::DOMAINS)
&& !msg.text().unwrap_or_default().contains(REPLACE_SKIP_TOKEN);
if should_match
&& let Ok(ref mut map) = FIXER_STATE.try_lock()
&& let Some(chat_id) = msg.chat_id()
{
let state = map.entry(chat_id).or_insert(FixerState::default());
return state.twitter
&& msg
.text()
.map(|text| {
twitter::MATCH_REGEX.is_match(text)
&& !text.contains(REPLACE_SKIP_TOKEN)
})
.unwrap_or_default();
return map.entry(chat_id).or_insert(FixerState::default()).twitter;
}
false
})
@@ -70,18 +66,16 @@ async fn run() {
#[cfg(feature = "ddinstagram")]
let handler = handler.branch(
dptree::filter(|msg: Message| {
if let Ok(ref mut map) = FIXER_STATE.try_lock()
let should_match = has_matching_urls(&msg, &instagram::DOMAINS)
&& !msg.text().unwrap_or_default().contains(REPLACE_SKIP_TOKEN);
if should_match
&& let Ok(ref mut map) = FIXER_STATE.try_lock()
&& let Some(chat_id) = msg.chat_id()
{
let state = map.entry(chat_id).or_insert(FixerState::default());
return state.instagram
&& msg
.text()
.map(|text| {
instagram::MATCH_REGEX.is_match(text)
&& !text.contains(REPLACE_SKIP_TOKEN)
})
.unwrap_or_default();
return map
.entry(chat_id)
.or_insert(FixerState::default())
.instagram;
}
false
})
@@ -89,18 +83,13 @@ async fn run() {
);
let handler = handler.branch(
dptree::filter(|msg: Message| {
if let Ok(ref mut map) = FIXER_STATE.try_lock()
let should_match = has_matching_urls(&msg, &youtube::DOMAINS)
&& !msg.text().unwrap_or_default().contains(REPLACE_SKIP_TOKEN);
if should_match
&& let Ok(ref mut map) = FIXER_STATE.try_lock()
&& let Some(chat_id) = msg.chat_id()
{
let state = map.entry(chat_id).or_insert(FixerState::default());
return state.youtube
&& msg
.text()
.map(|text| {
youtube::MATCH_REGEX.is_match(text)
&& !text.contains(REPLACE_SKIP_TOKEN)
})
.unwrap_or_default();
return map.entry(chat_id).or_insert(FixerState::default()).youtube;
}
false
})
@@ -108,17 +97,14 @@ async fn run() {
);
let handler = handler.branch(
dptree::filter(|msg: Message| {
if let Ok(ref mut map) = FIXER_STATE.try_lock()
let should_match = has_matching_urls(&msg, &medium::DOMAINS);
let should_match =
should_match && !msg.text().unwrap_or_default().contains(REPLACE_SKIP_TOKEN);
if should_match
&& let Ok(ref mut map) = FIXER_STATE.try_lock()
&& let Some(chat_id) = msg.chat_id()
{
let state = map.entry(chat_id).or_insert(FixerState::default());
return state.medium
&& msg
.text()
.map(|text| {
medium::MATCH_REGEX.is_match(text) && !text.contains(REPLACE_SKIP_TOKEN)
})
.unwrap_or_default();
return map.entry(chat_id).or_insert(FixerState::default()).medium;
}
false
})
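
Every service branch in main.rs now follows the same shape: a cheap should_match check (domain match plus skip-token check) runs before the FIXER_STATE mutex is touched, and the per-chat toggle is read straight off the map entry. A condensed sketch of that shape, using a hypothetical helper name and the crate's existing FIXER_STATE, FixerState, has_matching_urls, and REPLACE_SKIP_TOKEN items:

// Hypothetical helper illustrating the shared filter logic; each dptree branch
// inlines this with its own DOMAINS array and FixerState field.
fn branch_should_fire(
    msg: &Message,
    domains: &[&str],
    enabled: impl Fn(&FixerState) -> bool,
) -> bool {
    let should_match = has_matching_urls(msg, domains)
        && !msg.text().unwrap_or_default().contains(REPLACE_SKIP_TOKEN);
    if should_match
        && let Ok(ref mut map) = FIXER_STATE.try_lock()
        && let Some(chat_id) = msg.chat_id()
    {
        // The lock is only taken once the message is known to contain a matching URL.
        return enabled(map.entry(chat_id).or_insert(FixerState::default()));
    }
    false
}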

View File

@@ -11,7 +11,8 @@ const HOST_MATCH_GROUP: &str = "host";
const PATH_MATCH_GROUP: &str = "path";
const USER_MATCH_GROUP: &str = "user";
pub static MATCH_REGEX: Lazy<Regex> = Lazy::new(|| {
pub const DOMAINS: [&str; 1] = ["medium.com"];
static MATCH_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new("https://(?<user>[a-zA-Z0-9]*)?.?(?<host>medium.com)/(?<path>.*)").unwrap()
});

View File

@@ -9,7 +9,8 @@ const HOST_MATCH_GROUP: &str = "host";
const HOST_MATCH_GROUP: &str = "host";
const ROOT_MATCH_GROUP: &str = "root";
pub static MATCH_REGEX: Lazy<Regex> = Lazy::new(|| {
pub const DOMAINS: [&str; 2] = ["twitter.com", "x.com"];
static MATCH_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new("https://(?P<host>(?:mobile.)?(?P<root>(twitter|x)).com)/.*/status/[0-9]+.*")
.unwrap()
});

View File

@@ -6,7 +6,7 @@ use tracing::{error, info};
pub(crate) type AsyncError = Box<dyn Error + Send + Sync + 'static>;
pub(crate) fn get_urls_from_message(msg: &Message) -> Vec<String> {
pub(crate) fn get_urls_from_message(msg: &Message) -> Vec<Url> {
if let Some(entities) = msg.entities()
&& !entities.is_empty()
&& let Some(text) = msg.text()
@@ -24,26 +24,36 @@ pub(crate) fn get_urls_from_message(msg: &Message) -> Vec<String> {
let utf16 = text.encode_utf16().collect::<Vec<u16>>();
let mut urls = Vec::with_capacity(url_entities.len());
for entity in &url_entities {
urls.push(String::from_utf16_lossy(
if let Ok(url) = Url::parse(&String::from_utf16_lossy(
&utf16[entity.offset..entity.offset + entity.length],
));
)) {
urls.push(url);
}
}
info!(message_id = %msg.id.0, ?urls, "get_urls_from_message");
let url_str = urls.iter().map(reqwest::Url::as_str).collect::<Vec<&str>>();
info!(message_id = %msg.id.0, urls = ?url_str, "get_urls_from_message");
return urls;
}
Vec::new()
}
pub(crate) fn has_matching_urls(msg: &Message, domains: &[&str]) -> bool {
get_urls_from_message(msg).iter().any(|url| {
if let Some(host) = url.host_str() {
return domains.iter().any(|domain| host.ends_with(domain));
}
false
})
}
pub(crate) fn scrub_urls(msg: &Message) -> Option<String> {
if let Some(text) = msg.text() {
let urls = get_urls_from_message(msg);
let mut final_text = text.to_owned();
for item in urls {
if let Ok(url) = Url::parse(&item)
&& let Some(query_str) = url.query()
{
let scrubbed_url = item.replace(&format!("?{query_str}"), "");
final_text = final_text.replace(&item, &scrubbed_url);
for url in urls {
if let Some(query_str) = url.query() {
let scrubbed_url = url.as_str().replace(&format!("?{query_str}"), "");
final_text = final_text.replace(url.as_str(), &scrubbed_url);
}
}
info!(?text, ?final_text, "scrub_urls");
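
With get_urls_from_message returning typed reqwest::Url values, the new has_matching_urls check reduces to a host-suffix comparison against a module's DOMAINS list. A small self-contained sketch of that check, with illustrative URLs that are not taken from the diff:

use reqwest::Url;

// Mirrors the host-suffix test performed by has_matching_urls above.
fn host_matches(url: &Url, domains: &[&str]) -> bool {
    url.host_str()
        .map(|host| domains.iter().any(|domain| host.ends_with(domain)))
        .unwrap_or(false)
}

fn main() {
    let url = Url::parse("https://www.instagram.com/p/abc123/").unwrap();
    assert!(host_matches(&url, &["instagram.com"])); // the "www." prefix still matches
    let url = Url::parse("https://x.com/someone/status/1234567890?s=20").unwrap();
    assert!(host_matches(&url, &["twitter.com", "x.com"])); // query string is irrelevant here
}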

View File

@@ -6,7 +6,8 @@ use once_cell::sync::Lazy;
use regex::Regex;
use teloxide::{prelude::Requester, types::Message, utils::html::link, Bot};
pub static MATCH_REGEX: Lazy<Regex> = Lazy::new(|| {
pub const DOMAINS: [&str; 1] = ["youtube.com"];
static MATCH_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new("https://(?:www.)?youtube.com/(?P<shorts>shorts/)[A-Za-z0-9-_]{11}.*").unwrap()
});