Add url bot plugin based on url-bot-rs (only the difference from their repo)
authorunc0rr
Sat, 12 Jun 2021 20:15:42 +0200
changeset 15790 efe4e3290870
parent 15789 d97ea528ce95
child 15791 2528e3508bf4
Add url bot plugin based on url-bot-rs (only the difference from their repo)
tools/ubot-plugins/url-bot-rs/Cargo.toml
tools/ubot-plugins/url-bot-rs/src/bin/ubot-url-plugin.rs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ubot-plugins/url-bot-rs/Cargo.toml	Sat Jun 12 20:15:42 2021 +0200
@@ -0,0 +1,88 @@
+[package]
+name = "url-bot-rs"
+version = "0.3.1"
+description = "Minimal IRC URL bot in Rust"
+keywords = ["irc", "bot", "title"]
+repository = "https://github.com/nuxeh/url-bot-rs"
+authors = ["Edward Cragg <drq.11235@gmail.com>"]
+license = "ISC"
+readme = "README.md"
+build = "build.rs"
+edition = "2018"
+include = [
+  "src/**/*",
+  "build.rs",
+  "Cargo.*",
+  "README.md",
+  "COPYING",
+  "example.config.toml"
+]
+
+[build-dependencies]
+built = { version = "0.4.4", features = ["git2"] }
+man = "0.3.0"
+
+[dev-dependencies]
+tiny_http = "0.8.0"
+diff = "0.1.12"
+tempfile = "3.2.0"
+
+[dependencies]
+irc = "0.13.6"
+tokio-core = "0.1.18"
+rusqlite = "0.14.0"
+chrono = "0.4.19"
+docopt = "1.1.0"
+serde = "1.0.123"
+serde_derive = "1.0.104"
+itertools = "0.10.0"
+regex = "1.4.3"
+lazy_static = "1.4.0"
+failure = "0.1.8"
+reqwest = { version = "0.11.0", features = ["blocking", "cookies", "json"] }
+serde_rusqlite = "0.14.0"
+mime = "0.3.16"
+humansize = "1.1.0"
+unicode-segmentation = "1.7.1"
+toml = "0.5.8"
+directories = "3.0.1"
+log = "0.4.13"
+stderrlog = "0.5.1"
+atty = "0.2.14"
+scraper = { version = "0.12.0", default-features = false, features = [] }
+phf = "0.7.24"
+
+anyhow = "1.0"
+tokio-amqp = "1.0"
+lapin = "1.7"
+tokio = {version="1.6", features = ["full"]}
+futures = "0.3"
+url = "2.2"
+rand = "0.8"
+
+[dependencies.image]
+version = "0.22.5"
+default-features = false
+features = ["gif_codec", "jpeg", "png_codec", "pnm", "tiff", "bmp"]
+
+[features]
+default = []
+sqlite_bundled = ["rusqlite/bundled"]
+
+[package.metadata.deb]
+extended-description = """\
+Standalone IRC bot; for resolving URLs posted, retrieving, and posting page
+titles to a configurable IRC server and channels"""
+maintainer-scripts = "debian"
+assets = [
+    ["example.config.toml", "usr/share/doc/url-bot-rs/", "644"],
+    ["target/assets/url-bot-rs.1", "usr/local/share/man/man1/", "644"],
+    ["systemd/url-bot-rs.service", "lib/systemd/system/", "644"],
+    ["target/release/url-bot-rs", "usr/bin/", "755"],
+    ["target/release/url-bot-get", "usr/bin/", "755"]
+]
+
+[badges]
+coveralls = { repository = "nuxeh/url-bot-rs", branch = "master", service = "github" }
+codecov = { repository = "nuxeh/url-bot-rs", branch = "master" }
+travis-ci = { repository = "nuxeh/url-bot-rs", branch = "master" }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ubot-plugins/url-bot-rs/src/bin/ubot-url-plugin.rs	Sat Jun 12 20:15:42 2021 +0200
@@ -0,0 +1,331 @@
+use url_bot_rs::config::Rtd;
+use url_bot_rs::VERSION;
+use url_bot_rs::{feat, http::resolve_url, param, plugins::TITLE_PLUGINS, tld::TLD};
+
+use anyhow::Result as AHResult;
+use atty::{is, Stream};
+use directories::ProjectDirs;
+use docopt::Docopt;
+use failure::Error;
+use lazy_static::lazy_static;
+use log::{error, info};
+use regex::Regex;
+use reqwest::Url;
+use serde_derive::Deserialize;
+use std::collections::HashSet;
+use std::path::PathBuf;
+use stderrlog::{ColorChoice, Timestamp};
+
+use lapin::{options::*, types::FieldTable, BasicProperties, Connection, ConnectionProperties};
+use tokio_amqp::*;
+
+use futures::prelude::*;
+
+use rand::distributions::Alphanumeric;
+use rand::{thread_rng, Rng};
+
+use std::sync::mpsc;
+use std::thread;
+
+// docopt usage string
+const USAGE: &str = "
+URL munching IRC bot.
+
+Usage:
+    ubot-url-plugin [options] [-v...] [--conf=PATH...] [--conf-dir=DIR...]
+
+Options:
+    -h --help           Show this help message.
+    --version           Print version.
+    -v --verbose        Show extra information.
+    -t --timestamp      Force timestamps.
+";
+
+#[derive(Debug, Deserialize, Default)]
+pub struct Args {
+    flag_verbose: usize,
+    flag_conf: Vec<PathBuf>,
+    flag_conf_dir: Vec<PathBuf>,
+    flag_timestamp: bool,
+}
+
+const MIN_VERBOSITY: usize = 2;
+
+#[derive(Debug, PartialEq)]
+enum TitleResp {
+    Title(String),
+    Error(String),
+}
+
+/// Run available plugins on a single URL, return the first successful title.
+fn process_plugins(rtd: &Rtd, url: &Url) -> Option<String> {
+    let result: String = TITLE_PLUGINS
+        .iter()
+        .filter(|p| p.check(&rtd.conf.plugins, url))
+        .filter_map(|p| p.evaluate(&rtd, url).ok())
+        .take(1)
+        .collect();
+
+    if result.is_empty() {
+        None
+    } else {
+        Some(result)
+    }
+}
+
+/// find titles in a message and generate responses
+fn process_titles(rtd: &Rtd, msg: &str) -> impl Iterator<Item = TitleResp> {
+    let mut responses: Vec<TitleResp> = vec![];
+
+    let mut num_processed = 0;
+    let mut dedup_urls = HashSet::new();
+
+    // look at each space-separated message token
+    for token in msg.split_whitespace() {
+        // the token must not contain unsafe characters
+        if contains_unsafe_chars(token) {
+            continue;
+        }
+
+        // get a full URL for tokens without a scheme
+        let maybe_token = if feat!(rtd, partial_urls) {
+            add_scheme_for_tld(token)
+        } else {
+            None
+        };
+
+        let token = maybe_token.as_ref().map_or(token, String::as_str);
+
+        // the token must be a valid url
+        let url = match token.parse::<Url>() {
+            Ok(url) => url,
+            _ => continue,
+        };
+
+        // the scheme must be http or https
+        if !["http", "https"].contains(&url.scheme()) {
+            continue;
+        }
+
+        // skip duplicate urls within the message
+        if dedup_urls.contains(&url) {
+            continue;
+        }
+
+        info!("[{}] RESOLVE <{}>", rtd.conf.network.name, token);
+
+        // try to get the title from the url
+        let title = if let Some(title) = process_plugins(rtd, &url) {
+            title
+        } else {
+            match resolve_url(token, rtd) {
+                Ok(title) => title,
+                Err(err) => {
+                    error!("{:?}", err);
+                    responses.push(TitleResp::Error(err.to_string()));
+                    continue;
+                }
+            }
+        };
+
+        // limit response length, see RFC1459
+
+        let msg = utf8_truncate(&format!("⤷ {}", title), 510);
+
+        info!("[{}] {}", rtd.conf.network.name, msg);
+
+        responses.push(TitleResp::Title(msg.to_string()));
+
+        dedup_urls.insert(url);
+
+        // limit the number of processed URLs
+        num_processed += 1;
+        if num_processed == param!(rtd, url_limit) {
+            break;
+        }
+    }
+
+    responses.into_iter()
+}
+
+// regex for unsafe characters, as defined in RFC 1738
+const RE_UNSAFE_CHARS: &str = r#"[{}|\\^~\[\]`<>"]"#;
+
+/// does the token contain characters not permitted by RFC 1738
+fn contains_unsafe_chars(token: &str) -> bool {
+    lazy_static! {
+        static ref UNSAFE: Regex = Regex::new(RE_UNSAFE_CHARS).unwrap();
+    }
+    UNSAFE.is_match(token)
+}
+
+/// truncate to a maximum number of bytes, taking UTF-8 into account
+fn utf8_truncate(s: &str, n: usize) -> String {
+    s.char_indices()
+        .take_while(|(len, c)| len + c.len_utf8() <= n)
+        .map(|(_, c)| c)
+        .collect()
+}
+
+lazy_static! {
+    static ref REPEATED_DOTS: Regex = Regex::new(r"\.\.+").unwrap();
+}
+
+/// if a token has a recognised TLD, but no scheme, add one
+pub fn add_scheme_for_tld(token: &str) -> Option<String> {
+    if token.parse::<Url>().is_err() {
+        if token.starts_with(|s: char| !s.is_alphabetic()) {
+            return None;
+        }
+
+        if REPEATED_DOTS.is_match(&token) {
+            return None;
+        }
+
+        let new_token = format!("http://{}", token);
+
+        if let Ok(url) = new_token.parse::<Url>() {
+            if !url.domain()?.contains('.') {
+                return None;
+            }
+
+            // reject email addresses
+            if url.username() != "" {
+                return None;
+            }
+
+            let tld = url.domain()?.split('.').last()?;
+
+            if TLD.contains(tld) {
+                return Some(new_token);
+            }
+        }
+    }
+
+    None
+}
+
+fn init_rtd() -> AHResult<Rtd, Error> {
+    // parse command line arguments with docopt
+    let args: Args = Docopt::new(USAGE)
+        .and_then(|d| d.version(Some(VERSION.to_string())).deserialize())
+        .unwrap_or_else(|e| e.exit());
+
+    // avoid timestamping when piped, e.g. systemd
+    let timestamp = if is(Stream::Stderr) || args.flag_timestamp {
+        Timestamp::Second
+    } else {
+        Timestamp::Off
+    };
+
+    stderrlog::new()
+        .module(module_path!())
+        .modules(vec![
+            "url_bot_rs::message",
+            "url_bot_rs::config",
+            "url_bot_rs::http",
+        ])
+        .verbosity(args.flag_verbose + MIN_VERBOSITY)
+        .timestamp(timestamp)
+        .color(ColorChoice::Never)
+        .init()
+        .unwrap();
+
+    let dirs = ProjectDirs::from("org", "", "url-bot-rs").unwrap();
+    let default_conf_dir = dirs.config_dir();
+
+    let default_conf = default_conf_dir.join("config.toml");
+
+    let rtd: Rtd = Rtd::new().conf(&default_conf).load()?.init_http_client()?;
+
+    Ok(rtd)
+}
+
+fn random_string(size: usize) -> String {
+    thread_rng()
+        .sample_iter(&Alphanumeric)
+        .take(size)
+        .map(char::from)
+        .collect()
+}
+
+#[tokio::main]
+async fn main() -> AHResult<()> {
+    let (tx1, rx1) = mpsc::channel::<String>();
+    let (tx2, rx2) = mpsc::channel();
+
+    thread::spawn(move || {
+        let rtd = init_rtd().expect("RTD not initialized");
+
+        loop {
+            let message = &rx1.recv().expect("rx1 recv error");
+            let titles: Vec<_> = process_titles(&rtd, message).collect();
+            tx2.send(titles).expect("tx2 send error");
+        }
+    });
+    let amqp_url = std::env::var("AMQP_URL").expect("expected AMQP_URL env variabe");
+    let conn = Connection::connect(&amqp_url, ConnectionProperties::default().with_tokio()).await?;
+
+    let pub_channel = conn.create_channel().await?;
+    let sub_channel = conn.create_channel().await?;
+
+    let queue = sub_channel
+        .queue_declare(
+            &random_string(32),
+            QueueDeclareOptions {
+                exclusive: true,
+                auto_delete: true,
+                ..QueueDeclareOptions::default()
+            },
+            FieldTable::default(),
+        )
+        .await?;
+
+    sub_channel
+        .queue_bind(
+            queue.name().as_str(),
+            "irc",
+            "msg.hedgewars",
+            QueueBindOptions::default(),
+            FieldTable::default(),
+        )
+        .await?;
+
+    let mut subscriber = sub_channel
+        .basic_consume(
+            queue.name().as_str(),
+            &random_string(32),
+            BasicConsumeOptions::default(),
+            FieldTable::default(),
+        )
+        .await?;
+
+    while let Some(amqp_message) = subscriber.next().await {
+        let (_, delivery) = amqp_message.expect("error in consumer");
+        delivery.ack(BasicAckOptions::default()).await?;
+
+        let chat_message = String::from_utf8(delivery.data)?;
+        if let Some((_who, message)) = chat_message.split_once('\n') {
+            tx1.send(message.to_owned())?;
+            let titles = rx2.recv()?;
+
+            for title in titles {
+                let title_message = match title {
+                    TitleResp::Title(t) => t,
+                    TitleResp::Error(e) => e,
+                };
+                pub_channel
+                    .basic_publish(
+                        "irc",
+                        "say.hedgewars",
+                        BasicPublishOptions::default(),
+                        title_message.as_bytes().to_vec(),
+                        BasicProperties::default(),
+                    )
+                    .await?;
+            }
+        }
+    }
+
+    Ok(())
+}