diff --git a/Cargo.lock b/Cargo.lock index 0b171f5c98..b0176dc168 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -430,7 +430,7 @@ dependencies = [ "ansi_term 0.11.0", "atty", "bitflags", - "strsim 0.8.0", + "strsim", "textwrap 0.11.0", "unicode-width", "vec_map", @@ -533,41 +533,6 @@ dependencies = [ "syn", ] -[[package]] -name = "darling" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9d6ddad5866bb2170686ed03f6839d31a76e5407d80b1c334a2c24618543ffa" -dependencies = [ - "darling_core", - "darling_macro", -] - -[[package]] -name = "darling_core" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9ced1fd13dc386d5a8315899de465708cf34ee2a6d9394654515214e67bb846" -dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim 0.10.0", - "syn", -] - -[[package]] -name = "darling_macro" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a7a1445d54b2f9792e3b31a3e715feabbace393f38dc4ffd49d94ee9bc487d5" -dependencies = [ - "darling_core", - "quote", - "syn", -] - [[package]] name = "data-encoding" version = "2.3.2" @@ -588,38 +553,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "derive_builder" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ef25735c9f0d0c547d2794701600c94abf030ecb740fad1673fa64461f3573" -dependencies = [ - "derive_builder_core", - "derive_builder_macro", -] - -[[package]] -name = "derive_builder_core" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3150f1e84602847b99d3eeb702487fc364f7d6c94f634e944a68fdbaea09e457" -dependencies = [ - "darling", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "derive_builder_macro" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5ca1008bddefdc08d1e734aeb27b94f384390af261b4d1a8fb51fe19c577f05c" -dependencies = [ - "derive_builder_core", - "syn", -] - [[package]] name = "diff" version = "0.1.12" @@ -1206,12 +1139,6 @@ dependencies = [ "unicase", ] -[[package]] -name = "ident_case" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" - [[package]] name = "idna" version = "0.2.2" @@ -1425,7 +1352,6 @@ version = "0.6.0" dependencies = [ "check-if-email-exists", "deadpool", - "derive_builder", "doc-comment", "fast_chemail", "glob", @@ -1444,6 +1370,7 @@ dependencies = [ "shellexpand", "tempfile", "tokio", + "typed-builder", "url", "wiremock", ] @@ -2437,12 +2364,6 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" -[[package]] -name = "strsim" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - [[package]] name = "structopt" version = "0.3.21" @@ -2735,6 +2656,17 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" +[[package]] +name = "typed-builder" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "345426c7406aa355b60c5007c79a2d1f5b605540072795222f17f6443e6a9c6f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "typenum" version = "1.13.0" diff --git a/README.md b/README.md index 68db9c975f..83f4b2c8ed 100644 --- a/README.md +++ b/README.md @@ -154,8 +154,8 @@ USAGE: lychee [FLAGS] [OPTIONS] [--] [inputs]... FLAGS: - -E, --exclude-all-private Exclude all private IPs from checking. 
Equivalent to `--exclude-private --exclude-link- - local --exclude-loopback` + -E, --exclude-all-private Exclude all private IPs from checking. + Equivalent to `--exclude-private --exclude-link-local --exclude-loopback` --exclude-link-local Exclude link-local IP address range from checking --exclude-loopback Exclude loopback IP address range from checking --exclude-mail Exclude all mail addresses from checking @@ -163,8 +163,8 @@ FLAGS: --glob-ignore-case Ignore case when expanding filesystem path glob inputs --help Prints help information -i, --insecure Proceed for server connections considered insecure (invalid TLS) - -n, --no-progress Do not show progress bar. This is recommended for non-interactive shells (e.g. for - continuous integration) + -n, --no-progress Do not show progress bar. + This is recommended for non-interactive shells (e.g. for continuous integration) --skip-missing Skip missing input files (default is to error if they don't exist) -V, --version Prints version information -v, --verbose Verbose program output @@ -226,7 +226,7 @@ use lychee_lib::{ClientBuilder, Result, Status}; #[tokio::main] async fn main() -> Result<()> { - let client = ClientBuilder::default().build()?; + let client = ClientBuilder::default().client()?; let response = client.check("https://github.com/lycheeverse/lychee").await?; assert!(response.status().is_success()); Ok(()) @@ -236,7 +236,7 @@ async fn main() -> Result<()> { The client builder is very customizable: ```rust, ignore -let client = lychee_lib::ClientBuilder::default() +let client = lychee_lib::ClientBuilder::builder() .includes(includes) .excludes(excludes) .max_redirects(cfg.max_redirects) @@ -249,7 +249,8 @@ let client = lychee_lib::ClientBuilder::default() .github_token(cfg.github_token) .scheme(cfg.scheme) .accepted(accepted) - .build()?; + .build() + .client()?; ``` All options that you set will be used for all link checks. 
diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 27f2ceb9c9..059610e802 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -120,7 +120,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { let include = RegexSet::new(&cfg.include)?; let exclude = RegexSet::new(&cfg.exclude)?; - let client = ClientBuilder::default() + let client = ClientBuilder::builder() .includes(include) .excludes(exclude) .exclude_all_private(cfg.exclude_all_private) @@ -138,6 +138,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { .scheme(cfg.scheme.clone()) .accepted(accepted) .build() + .client() .map_err(|e| anyhow!(e))?; let links = collect_links( diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 63e2ca57d8..6476ce5842 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -118,9 +118,8 @@ pub(crate) struct Config { pub(crate) verbose: bool, /// Do not show progress bar. - /// This is recommended for non-interactive shells (e.g. for continuous - /// integration) - #[structopt(short, long)] + /// This is recommended for non-interactive shells (e.g. for continuous integration) + #[structopt(short, long, verbatim_doc_comment)] #[serde(default)] pub(crate) no_progress: bool, @@ -167,7 +166,7 @@ pub(crate) struct Config { /// Exclude all private IPs from checking. 
/// Equivalent to `--exclude-private --exclude-link-local --exclude-loopback` - #[structopt(short = "E", long)] + #[structopt(short = "E", long, verbatim_doc_comment)] #[serde(default)] pub(crate) exclude_all_private: bool, diff --git a/lychee-bin/src/stats.rs b/lychee-bin/src/stats.rs index 04dc82ec30..467d0d5c82 100644 --- a/lychee-bin/src/stats.rs +++ b/lychee-bin/src/stats.rs @@ -144,7 +144,7 @@ mod test { .await; ClientBuilder::default() - .build() + .client() .unwrap() .check(mock_server.uri()) .await diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index dee435fc0b..d92e6efbf1 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -19,7 +19,6 @@ version = "0.6.0" [dependencies] check-if-email-exists = "0.8.21" deadpool = "0.7.0" -derive_builder = "0.10.0" fast_chemail = "0.9.6" glob = "0.3.0" html5ever = "0.25.1" @@ -39,6 +38,7 @@ ring = "0.16.20" serde = { version = "1.0.125", features = ["derive"] } shellexpand = "2.1.0" tokio = { version = "1.5.0", features = ["full"] } +typed-builder = "0.9.0" url = { version = "2.2.1", features = ["serde"] } [dev-dependencies] diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index b3661bf179..07b409a65d 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -1,12 +1,12 @@ #![allow( clippy::module_name_repetitions, clippy::struct_excessive_bools, - clippy::default_trait_access + clippy::default_trait_access, + clippy::used_underscore_binding )] use std::{collections::HashSet, convert::TryFrom, time::Duration}; use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; -use derive_builder::Builder; use http::{ header::{HeaderMap, HeaderValue}, StatusCode, @@ -15,6 +15,7 @@ use hubcaps::{Credentials, Github}; use regex::RegexSet; use reqwest::header; use tokio::time::sleep; +use typed_builder::TypedBuilder; use crate::{ filter::{Excludes, Filter, Includes}, @@ -45,11 +46,9 @@ pub struct Client { /// A link checker using an API token for Github links /// 
otherwise a normal HTTP client. #[allow(unreachable_pub)] -#[derive(Builder, Debug)] -#[builder(build_fn(skip))] -#[builder(setter(into))] -#[builder(name = "ClientBuilder")] -pub struct ClientBuilderInternal { +#[derive(TypedBuilder, Debug)] +#[builder(field_defaults(default, setter(into)))] +pub struct ClientBuilder { /// Set an optional Github token. /// This allows for more requests before /// getting rate-limited. @@ -69,8 +68,12 @@ pub struct ClientBuilderInternal { /// Don't check mail addresses exclude_mail: bool, /// Maximum number of redirects before returning error + #[builder(default = DEFAULT_MAX_REDIRECTS)] max_redirects: usize, /// User agent used for checking links + // Faking the user agent is necessary for some websites, unfortunately. + // Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com). + #[builder(default_code = "String::from(DEFAULT_USER_AGENT)")] user_agent: String, /// Ignore SSL errors allow_insecure: bool, @@ -83,6 +86,7 @@ pub struct ClientBuilderInternal { /// on some websites. custom_headers: HeaderMap, /// Request method (e.g. 
`GET` or `HEAD`) + #[builder(default = reqwest::Method::GET)] method: reqwest::Method, /// Set of accepted return codes / status codes accepted: Option>, @@ -90,73 +94,62 @@ pub struct ClientBuilderInternal { timeout: Option, } -impl ClientBuilder { - fn build_excludes(&self) -> Excludes { - // exclude_all_private option turns on all "private" excludes, - // including private IPs, link-local IPs and loopback IPs - let exclude_all_private = matches!(self.exclude_all_private, Some(true)); - let enable_exclude = |opt| exclude_all_private || matches!(opt, Some(true)); - - Excludes { - regex: self.excludes.clone().unwrap_or_default(), - private_ips: enable_exclude(self.exclude_private_ips), - link_local_ips: enable_exclude(self.exclude_link_local_ips), - loopback_ips: enable_exclude(self.exclude_loopback_ips), - mail: self.exclude_mail.unwrap_or_default(), - } +impl Default for ClientBuilder { + fn default() -> Self { + Self::builder().build() } +} - fn build_includes(&self) -> Includes { - let regex = self.includes.clone().flatten(); - Includes { regex } +impl ClientBuilder { + fn build_filter(&self) -> Filter { + let includes = self.includes.clone().map(|regex| Includes { regex }); + let excludes = self.excludes.clone().map(|regex| Excludes { regex }); + let scheme = self.scheme.clone().map(|s| s.to_lowercase()); + + Filter { + includes, + excludes, + scheme, + // exclude_all_private option turns on all "private" excludes, + // including private IPs, link-local IPs and loopback IPs (but not mail) + exclude_private_ips: self.exclude_all_private || self.exclude_private_ips, + exclude_link_local_ips: self.exclude_all_private || self.exclude_link_local_ips, + exclude_loopback_ips: self.exclude_all_private || self.exclude_loopback_ips, + exclude_mail: self.exclude_mail, + } } /// The build method instantiates the client.
#[allow(clippy::missing_errors_doc)] - pub fn build(&self) -> Result { - // Faking the user agent is necessary for some websites, unfortunately. - // Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com). - let user_agent = self - .user_agent - .clone() - .unwrap_or_else(|| DEFAULT_USER_AGENT.to_owned()); - - let mut headers = self.custom_headers.clone().unwrap_or_default(); - headers.insert(header::USER_AGENT, HeaderValue::from_str(&user_agent)?); + pub fn client(&self) -> Result { + let mut headers = self.custom_headers.clone(); + headers.insert(header::USER_AGENT, HeaderValue::from_str(&self.user_agent)?); headers.insert( header::TRANSFER_ENCODING, HeaderValue::from_static("chunked"), ); - let allow_insecure = self.allow_insecure.unwrap_or(false); - let max_redirects = self.max_redirects.unwrap_or(DEFAULT_MAX_REDIRECTS); - let builder = reqwest::ClientBuilder::new() .gzip(true) .default_headers(headers) - .danger_accept_invalid_certs(allow_insecure) - .redirect(reqwest::redirect::Policy::limited(max_redirects)); - - let timeout = self.timeout.flatten(); + .danger_accept_invalid_certs(self.allow_insecure) + .redirect(reqwest::redirect::Policy::limited(self.max_redirects)); - let reqwest_client = (match timeout { + let reqwest_client = (match self.timeout { Some(t) => builder.timeout(t), None => builder, }) .build()?; - let github_token = match self.github_token.clone().flatten() { - Some(token) if !token.is_empty() => { - Some(Github::new(user_agent, Credentials::Token(token))?) 
- } + let github_token = match self.github_token { + Some(ref token) if !token.is_empty() => Some(Github::new( + self.user_agent.clone(), + Credentials::Token(token.clone()), + )?), _ => None, }; - let includes = self.build_includes(); - let excludes = self.build_excludes(); - let scheme = self.scheme.clone().flatten().map(|s| s.to_lowercase()); - - let filter = Filter::new(Some(includes), Some(excludes), scheme); + let filter = self.build_filter(); let quirks = Quirks::default(); @@ -164,9 +157,9 @@ impl ClientBuilder { reqwest_client, github_client: github_token, filter, + method: self.method.clone(), + accepted: self.accepted.clone(), quirks, - method: self.method.clone().unwrap_or(reqwest::Method::GET), - accepted: self.accepted.clone().unwrap_or_default(), }) } } @@ -180,7 +173,7 @@ impl Client { let Request { uri, source } = Request::try_from(request)?; let status = if self.filter.is_excluded(&uri) { Status::Excluded - } else if uri.scheme() == "mailto" { + } else if uri.is_mail() { self.check_mail(&uri).await } else { self.check_website(&uri).await @@ -262,7 +255,7 @@ where Request: TryFrom, ErrorKind: From, { - let client = ClientBuilder::default().build()?; + let client = ClientBuilder::builder().build().client()?; Ok(client.check(request).await?) 
} @@ -345,9 +338,10 @@ mod test { assert!(res.status().is_failure()); // Same, but ignore certificate error - let res = ClientBuilder::default() + let res = ClientBuilder::builder() .allow_insecure(true) .build() + .client() .unwrap() .check("https://expired.badssl.com/") .await @@ -366,9 +360,10 @@ mod test { // See https://github.com/rust-lang/crates.io/issues/788 let mut custom = HeaderMap::new(); custom.insert(header::ACCEPT, "text/html".parse().unwrap()); - let res = ClientBuilder::default() + let res = ClientBuilder::builder() .custom_headers(custom) .build() + .client() .unwrap() .check("https://crates.io/crates/lychee") .await @@ -387,9 +382,10 @@ mod test { let mock_server = mock_server!(StatusCode::OK, set_delay(mock_delay)); - let client = ClientBuilder::default() + let client = ClientBuilder::builder() .timeout(checker_timeout) .build() + .client() .unwrap(); let res = client.check(mock_server.uri()).await.unwrap(); diff --git a/lychee-lib/src/filter/excludes.rs b/lychee-lib/src/filter/excludes.rs index e4878607ee..9e5b19c7bb 100644 --- a/lychee-lib/src/filter/excludes.rs +++ b/lychee-lib/src/filter/excludes.rs @@ -1,76 +1,23 @@ use regex::RegexSet; -use std::net::IpAddr; - -use crate::Uri; - -/// Pre-defined exclusions for known false-positives -static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"]; /// Exclude configuration for the link checker. -/// You can ignore links based on regex patterns or pre-defined IP ranges. -#[allow(clippy::struct_excessive_bools)] +/// You can ignore links based on regex patterns. 
#[derive(Clone, Debug)] pub struct Excludes { /// User-defined set of excluded regex patterns - pub regex: Option, - /// Example: 192.168.0.1 - pub private_ips: bool, - /// Example: 169.254.0.0 - pub link_local_ips: bool, - /// For IPv4: 127.0.0.1/8 - /// For IPv6: ::1/128 - pub loopback_ips: bool, - /// Example: octocat@github.com - pub mail: bool, -} - -impl Default for Excludes { - fn default() -> Self { - Self { - regex: None, - private_ips: false, - link_local_ips: false, - loopback_ips: false, - mail: false, - } - } + pub(crate) regex: RegexSet, } impl Excludes { #[inline] #[must_use] - pub fn regex(&self, input: &str) -> bool { - self.regex.as_ref().map_or(false, |re| re.is_match(input)) - } - - #[must_use] - pub fn is_false_positive(input: &str) -> bool { - input == FALSE_POSITIVE_PAT[0] - } - - #[must_use] - pub fn ip(&self, uri: &Uri) -> bool { - match uri.host_ip() { - Some(ip_addr) if self.loopback_ips && ip_addr.is_loopback() => true, - // Note: in a pathological case, an IPv6 address can be IPv4-mapped - // (IPv4 address embedded in a IPv6). We purposefully - // don't deal with it here, and assume if an address is IPv6, - // we shouldn't attempt to map it to IPv4. 
- // See: https://tools.ietf.org/html/rfc4291#section-2.5.5.2 - Some(IpAddr::V4(v4_addr)) if self.private_ips && v4_addr.is_private() => true, - Some(IpAddr::V4(v4_addr)) if self.link_local_ips && v4_addr.is_link_local() => true, - _ => false, - } + pub fn is_match(&self, input: &str) -> bool { + self.regex.is_match(input) } #[inline] #[must_use] - pub const fn is_mail_excluded(&self) -> bool { - self.mail - } - - #[inline] pub fn is_empty(&self) -> bool { - self.regex.as_ref().map_or(true, RegexSet::is_empty) + self.regex.is_empty() } } diff --git a/lychee-lib/src/filter/includes.rs b/lychee-lib/src/filter/includes.rs index 8a8ac7ce1b..d2c2698e13 100644 --- a/lychee-lib/src/filter/includes.rs +++ b/lychee-lib/src/filter/includes.rs @@ -2,20 +2,22 @@ use regex::RegexSet; /// Include configuration for the link checker. /// You can include links based on regex patterns -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug)] pub struct Includes { - pub regex: Option, + /// User-defined set of included regex patterns + pub regex: RegexSet, } impl Includes { #[inline] #[must_use] - pub fn regex(&self, input: &str) -> bool { - self.regex.as_ref().map_or(false, |re| re.is_match(input)) + pub fn is_match(&self, input: &str) -> bool { + self.regex.is_match(input) } #[inline] + #[must_use] pub fn is_empty(&self) -> bool { - self.regex.as_ref().map_or(true, RegexSet::is_empty) + self.regex.is_empty() } } diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs index a6fcb996e9..5d5b942004 100644 --- a/lychee-lib/src/filter/mod.rs +++ b/lychee-lib/src/filter/mod.rs @@ -1,74 +1,147 @@ mod excludes; mod includes; +use std::net::IpAddr; + pub use excludes::Excludes; pub use includes::Includes; use crate::uri::Uri; +/// Pre-defined exclusions for known false-positives +static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"]; + +#[inline] +#[must_use] +pub fn is_false_positive(input: &str) -> bool { + input == FALSE_POSITIVE_PAT[0] +} + /// A 
generic URI filter /// Used to decide if a given URI should be checked or skipped +#[allow(clippy::struct_excessive_bools)] #[derive(Clone, Debug, Default)] pub struct Filter { - pub(crate) includes: Includes, - pub(crate) excludes: Excludes, - pub(crate) scheme: Option, + pub includes: Option, + pub excludes: Option, + // TODO: accept multiple scheme + // TODO: includes scheme and excludes scheme + // TODO: excludes_mail should be merged to excludes scheme + // allowed scheme + pub scheme: Option, + /// Example: 192.168.0.1 + pub exclude_private_ips: bool, + /// Example: 169.254.0.0 + pub exclude_link_local_ips: bool, + /// For IPv4: 127.0.0.1/8 + /// For IPv6: ::1/128 + pub exclude_loopback_ips: bool, + /// Example: octocat@github.com + pub exclude_mail: bool, } impl Filter { + #[inline] #[must_use] - pub fn new( - includes: Option, - excludes: Option, - scheme: Option, - ) -> Self { - Filter { - includes: includes.unwrap_or_default(), - excludes: excludes.unwrap_or_default(), - scheme, + pub fn is_mail_excluded(&self, uri: &Uri) -> bool { + uri.is_mail() && self.exclude_mail + } + + #[must_use] + pub fn is_ip_excluded(&self, uri: &Uri) -> bool { + match uri.host_ip() { + Some(ip_addr) if self.exclude_loopback_ips && ip_addr.is_loopback() => true, + // Note: in a pathological case, an IPv6 address can be IPv4-mapped + // (IPv4 address embedded in a IPv6). We purposefully + // don't deal with it here, and assume if an address is IPv6, + // we shouldn't attempt to map it to IPv4. 
+ // See: https://tools.ietf.org/html/rfc4291#section-2.5.5.2 + Some(IpAddr::V4(v4_addr)) if self.exclude_private_ips && v4_addr.is_private() => true, + Some(IpAddr::V4(v4_addr)) if self.exclude_link_local_ips && v4_addr.is_link_local() => { + true + } + _ => false, } } + #[inline] + #[must_use] + pub fn is_scheme_excluded(&self, uri: &Uri) -> bool { + matches!(self.scheme, Some(ref scheme) if scheme != uri.scheme()) + } + + #[inline] + fn is_includes_empty(&self) -> bool { + !matches!(self.includes, Some(ref includes) if !includes.is_empty()) + } + + #[inline] + fn is_excludes_empty(&self) -> bool { + !matches!(self.excludes, Some(ref excludes) if !excludes.is_empty()) + } + + #[inline] + fn is_includes_match(&self, input: &str) -> bool { + matches!(self.includes, Some(ref includes) if includes.is_match(input)) + } + + #[inline] + fn is_excludes_match(&self, input: &str) -> bool { + matches!(self.excludes, Some(ref excludes) if excludes.is_match(input)) + } + + /// Determine whether a given [`Uri`] should be excluded. + /// + /// # Details + /// + /// 1. If any of the following conditions are met, the URI is excluded: + /// - If it's a mail address and it's configured to ignore mail addresses. + /// - If the IP address belongs to a type that is configured to exclude. + /// - If the scheme of URI is not the allowed scheme. + /// 2. Decide whether the URI is *presumably included* or *explicitly included*: + /// - When both excludes and includes rules are empty, it's *presumably included* unless + /// it's a known false positive. + /// - When the includes rules matches the URI, it's *explicitly included*. + /// 3. When it's a known *false positive* pattern, it's *explicitly excluded*. + /// 4. Decide whether the URI is *presumably excluded* or *explicitly excluded*: + /// - When excludes rules is empty, but includes rules doesn't match the URI, it's + /// *presumably excluded*. + /// - When the excludes rules matches the URI, it's *explicitly excluded*. 
+ /// - Otherwise, the URI is *not excluded*. #[must_use] pub fn is_excluded(&self, uri: &Uri) -> bool { - // Skip mail? - if self.excludes.is_mail_excluded() && uri.scheme() == "mailto" { - return true; - } - // Skip specific IP address? - if self.excludes.ip(&uri) { + // Skip mail address, specific IP, and scheme + if self.is_mail_excluded(uri) || self.is_ip_excluded(uri) || self.is_scheme_excluded(uri) { return true; } let input = uri.as_str(); - if self.includes.is_empty() { - if self.excludes.is_empty() { - // No regex includes/excludes at all? - // Not excluded unless it's a known false positive - return Excludes::is_false_positive(input); + + if self.is_includes_empty() { + if self.is_excludes_empty() { + // Both excludes and includes rules are empty: + // *Presumably included* unless it's false positive + return is_false_positive(input); } - } else if self.includes.regex(input) { - // Included explicitly (Includes take precedence over excludes) + } else if self.is_includes_match(input) { + // *Explicitly included* (Includes take precedence over excludes) return false; } - // Exclude well-known false-positives. - // This is done after checking includes to allow for user-overwrites. - if Excludes::is_false_positive(uri.as_str()) { - return true; - } - if self.excludes.is_empty() { - if !self.includes.is_empty() { - // In case we have includes and no excludes, - // skip everything that was not included - return true; - } - } else if self.excludes.regex(input) { - // Excluded explicitly + + if is_false_positive(input) + // Exclude well-known false-positives + // Performed after checking includes to allow user-overrides + || self.is_excludes_empty() + // Previous checks imply input is not explicitly included, + // if excludes rules is empty, then *presumably excluded* + || self.is_excludes_match(input) + // If excludes rules matches input, then + // *explicitly excluded* + { return true; } - // URI scheme excluded?
- matches!(self.scheme, Some(ref scheme) if scheme != uri.scheme()) + false } } @@ -94,21 +167,22 @@ mod test { const V4_LOOPBACK: &str = "http://127.0.0.1"; const V6_LOOPBACK: &str = "http://[::1]"; - const V4_LINK_LOCAL: &str = "http://169.254.0.1"; + const V4_LINK_LOCAL_1: &str = "http://169.254.0.1"; + const V4_LINK_LOCAL_2: &str = "http://169.254.10.1:8080"; // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6) const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]"; const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]"; macro_rules! assert_ip_address { - (v4: $ip:expr, $predicate:tt) => {{ + (v4: $ip:expr, $predicate:tt) => { let res = if let Host::Ipv4(ipv4) = Url::parse($ip).map_err(|_| ())?.host().ok_or(())? { ipv4.$predicate() } else { false }; std::assert!(res); - }}; + }; (v6: $ip:expr, $predicate:tt) => { let res = if let Host::Ipv6(ipv6) = Url::parse($ip).map_err(|_| ())?.host().ok_or(())? { ipv6.$predicate() @@ -129,7 +203,8 @@ mod test { assert_ip_address!(v4: V4_LOOPBACK, is_loopback); assert_ip_address!(v6: V6_LOOPBACK, is_loopback); - assert_ip_address!(v4: V4_LINK_LOCAL, is_link_local); + assert_ip_address!(v4: V4_LINK_LOCAL_1, is_link_local); + assert_ip_address!(v4: V4_LINK_LOCAL_2, is_link_local); Ok(()) } @@ -154,10 +229,10 @@ mod test { #[test] fn test_overwrite_false_positives() { let includes = Includes { - regex: Some(RegexSet::new(&[r"http://www.w3.org/1999/xhtml"]).unwrap()), + regex: RegexSet::new(&[r"http://www.w3.org/1999/xhtml"]).unwrap(), }; let filter = Filter { - includes, + includes: Some(includes), ..Filter::default() }; assert!(!filter.is_excluded(&website("http://www.w3.org/1999/xhtml"))); @@ -166,10 +241,10 @@ mod test { #[test] fn test_include_regex() { let includes = Includes { - regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), + regex: RegexSet::new(&[r"foo.example.org"]).unwrap(), }; let filter = Filter { - includes, + includes: Some(includes), ..Filter::default() }; @@ -181,12 
+256,8 @@ mod test { #[test] fn test_exclude_mail() { - let excludes = Excludes { - mail: true, - ..Excludes::default() - }; let filter = Filter { - excludes, + exclude_mail: true, ..Filter::default() }; @@ -198,13 +269,10 @@ mod test { #[test] fn test_exclude_regex() { let excludes = Excludes { - regex: Some( - RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap(), - ), - ..Excludes::default() + regex: RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap(), }; let filter = Filter { - excludes, + excludes: Some(excludes), ..Filter::default() }; @@ -218,15 +286,14 @@ mod test { #[test] fn test_exclude_include_regex() { let includes = Includes { - regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), + regex: RegexSet::new(&[r"foo.example.org"]).unwrap(), }; let excludes = Excludes { - regex: Some(RegexSet::new(&[r"example.org"]).unwrap()), - ..Excludes::default() + regex: RegexSet::new(&[r"example.org"]).unwrap(), }; let filter = Filter { - includes, - excludes, + includes: Some(includes), + excludes: Some(excludes), ..Filter::default() }; @@ -244,7 +311,8 @@ mod test { assert!(!filter.is_excluded(&website(V4_PRIVATE_CLASS_A))); assert!(!filter.is_excluded(&website(V4_PRIVATE_CLASS_B))); assert!(!filter.is_excluded(&website(V4_PRIVATE_CLASS_C))); - assert!(!filter.is_excluded(&website(V4_LINK_LOCAL))); + assert!(!filter.is_excluded(&website(V4_LINK_LOCAL_1))); + assert!(!filter.is_excluded(&website(V4_LINK_LOCAL_2))); assert!(!filter.is_excluded(&website(V4_LOOPBACK))); assert!(!filter.is_excluded(&website(V6_LOOPBACK))); } @@ -252,10 +320,7 @@ mod test { #[test] fn test_exclude_private_ips() { let filter = Filter { - excludes: Excludes { - private_ips: true, - ..Excludes::default() - }, + exclude_private_ips: true, ..Filter::default() }; @@ -267,23 +332,18 @@ mod test { #[test] fn test_exclude_link_local() { let filter = Filter { - excludes: Excludes { - link_local_ips: true, - ..Excludes::default() - }, + 
exclude_link_local_ips: true, ..Filter::default() }; - assert!(filter.is_excluded(&website(V4_LINK_LOCAL))); + assert!(filter.is_excluded(&website(V4_LINK_LOCAL_1))); + assert!(filter.is_excluded(&website(V4_LINK_LOCAL_2))); } #[test] fn test_exclude_loopback() { let filter = Filter { - excludes: Excludes { - loopback_ips: true, - ..Excludes::default() - }, + exclude_loopback_ips: true, ..Filter::default() }; @@ -294,11 +354,8 @@ mod test { #[test] fn test_exclude_ip_v4_mapped_ip_v6_not_supported() { let filter = Filter { - excludes: Excludes { - private_ips: true, - link_local_ips: true, - ..Excludes::default() - }, + exclude_private_ips: true, + exclude_link_local_ips: true, ..Filter::default() }; diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index c665bb15cd..daa6043ce6 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -20,7 +20,7 @@ //! //! #[tokio::main] //! async fn main() -> Result<()> { -//! let client = ClientBuilder::default().build()?; +//! let client = ClientBuilder::default().client()?; //! let response = client.check("https://github.com/lycheeverse/lychee").await?; //! assert!(response.status().is_success()); //! Ok(()) @@ -63,6 +63,7 @@ use doc_comment as _; // required for doctest use openssl_sys as _; // required for vendored-openssl feature use ring as _; // required for apple silicon +#[doc(inline)] pub use crate::{ client::{check, ClientBuilder}, client_pool::ClientPool, diff --git a/lychee-lib/src/test_utils.rs b/lychee-lib/src/test_utils.rs index def682706a..2f2f78ceb2 100644 --- a/lychee-lib/src/test_utils.rs +++ b/lychee-lib/src/test_utils.rs @@ -21,7 +21,7 @@ where ErrorKind: From, { ClientBuilder::default() - .build() + .client() .unwrap() .check(request) .await diff --git a/lychee-lib/src/uri.rs b/lychee-lib/src/uri.rs index 92e51ad14e..fa7d082775 100644 --- a/lychee-lib/src/uri.rs +++ b/lychee-lib/src/uri.rs @@ -56,10 +56,7 @@ impl Uri { // TODO: Support GitLab etc. 
pub(crate) fn extract_github(&self) -> Option<(&str, &str)> { - debug_assert!( - !matches!(self.scheme(), "mailto"), - "Should only be called on a Website type!" - ); + debug_assert!(!self.is_mail(), "Should only be called on a Website type!"); // TODO: Support more patterns if matches!( @@ -74,6 +71,11 @@ impl Uri { None } + + #[inline] + pub(crate) fn is_mail(&self) -> bool { + self.scheme() == "mailto" + } } impl AsRef for Uri {