diff --git a/Cargo.lock b/Cargo.lock index 3a8e332b..1f37d003 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1395,7 +1395,7 @@ dependencies = [ "hyperlocal", "log", "pin-project-lite", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustls-pki-types", @@ -1456,9 +1456,9 @@ dependencies = [ [[package]] name = "built" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c360505aed52b7ec96a3636c3f039d99103c37d1d9b4f7a8c743d3ea9ffcd03b" +checksum = "73848a43c5d63a1251d17adf6c2bf78aa94830e60a335a95eeea45d6ba9e1e4d" [[package]] name = "bumpalo" @@ -1511,9 +1511,9 @@ dependencies = [ [[package]] name = "bytesize" -version = "1.3.0" +version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e368af43e418a04d52505cf3dbc23dda4e3407ae2fa99fd0e4f308ce546acc" +checksum = "2d2c12f985c78475a6b8d629afd0c360260ef34cfef52efccdcfd31972f81c2e" dependencies = [ "serde", ] @@ -1689,18 +1689,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.28" +version = "4.5.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e77c3243bd94243c03672cb5154667347c457ca271254724f9f393aee1c05ff" +checksum = "8acebd8ad879283633b343856142139f2da2317c96b05b4dd6181c61e2480184" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.27" +version = "4.5.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7" +checksum = "f6ba32cbda51c7e1dfd49acc1457ba1a7dec5b64fe360e828acb13ca8dc9c2f9" dependencies = [ "anstyle", "clap_lex", @@ -2068,9 +2068,9 @@ dependencies = [ [[package]] name = "csv-core" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" dependencies = [ "memchr", ] @@ -2468,7 +2468,7 @@ dependencies = [ "itertools 0.13.0", "log", "paste", - "petgraph", + "petgraph 0.6.5", ] [[package]] @@ -3090,6 +3090,12 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + [[package]] name = "flatbuffers" version = "24.12.23" @@ -4168,7 +4174,7 @@ dependencies = [ "http 1.2.0", "hyper 1.6.0", "hyper-util", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-native-certs 0.8.1", "rustls-pki-types", "tokio", @@ -5586,9 +5592,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924" +checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" dependencies = [ "adler2", "simd-adler32", @@ -6348,7 +6354,17 @@ version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ - "fixedbitset", + "fixedbitset 0.4.2", + "indexmap 2.7.1", +] + +[[package]] +name = "petgraph" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset 0.5.7", "indexmap 2.7.1", ] @@ -6737,9 +6753,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c0fef6c4230e4ccf618a35c59d7ede15dea37de8427500f50aff708806e42ec" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = [ "bytes", "prost-derive", @@ -6747,16 +6763,16 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0f3e5beed80eb580c68e2c600937ac2c4eedabdfd5ef1e5b7ea4f3fba84497b" +checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck 0.5.0", - "itertools 0.13.0", + "itertools 0.14.0", "log", "multimap", "once_cell", - "petgraph", + "petgraph 0.7.1", "prettyplease", "prost", "prost-types", @@ -6767,12 +6783,12 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "157c5a9d7ea5c2ed2d9fb8f495b64759f7816c7eaea54ba3978f0d63000162e3" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.98", @@ -6780,9 +6796,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc2f1e56baa61e93533aebc21af4d2134b70f66275e0fcdf3cbe43d77ff7e8fc" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ "prost", ] @@ -6913,7 +6929,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash 2.1.1", - "rustls 0.23.22", + "rustls 0.23.23", "socket2 0.5.8", "thiserror 2.0.11", "tokio", @@ -6931,7 +6947,7 @@ dependencies = [ "rand 0.8.5", "ring", "rustc-hash 2.1.1", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-pki-types", "slab", "thiserror 2.0.11", @@ -7187,7 +7203,7 @@ dependencies = [ "num-bigint", "percent-encoding", "pin-project-lite", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-native-certs 0.7.3", "rustls-pemfile 2.2.0", "rustls-pki-types", @@ -7332,7 +7348,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustls-pki-types", @@ -7511,9 +7527,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.22" +version = "0.23.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb9263ab4eb695e42321db096e3b8fbd715a59b154d5c88d82db2175b681ba7" +checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" dependencies = [ "log", "once_cell", @@ -8100,9 +8116,9 @@ dependencies = [ [[package]] name = "spider" -version = "2.27.50" +version = "2.27.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6777700924fadc27a4393fb7f25f7ae7f4cb48ad0a1ae236eab11e009f2eada6" +checksum = "d6723d18da46460baeb7163154d5c0743fa73a544f0c28a2043ade5718d3abe7" dependencies = [ "ahash", "aho-corasick", @@ -9432,7 +9448,7 @@ version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ - "rustls 0.23.22", + "rustls 0.23.23", "tokio", ] @@ -9514,9 +9530,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.23" +version = "0.22.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02a8b472d1a3d7c18e2d61a489aee3453fd9031c33e4f55bd533f4a7adca1bee" +checksum = "17b4795ff5edd201c7cd6dca065ae59972ce77d1b80fa0a84d94950ece7d1474" dependencies = [ "indexmap 2.7.1", "serde", @@ -9966,7 +9982,7 @@ dependencies = [ "log", "native-tls", "once_cell", - "rustls 0.23.22", + "rustls 0.23.23", "rustls-pki-types", "serde", "serde_json", @@ -10586,9 +10602,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86e376c75f4f43f44db463cf729e0d3acbf954d13e22c51e26e4c264b4ab545f" +checksum = "59690dea168f2198d1a3b0cac23b8063efcd11012f10ae4698f284808c8ef603" dependencies = [ "memchr", ] diff --git a/swiftide-integrations/src/scraping/loader.rs b/swiftide-integrations/src/scraping/loader.rs index 509c57a1..1fc63270 100644 --- a/swiftide-integrations/src/scraping/loader.rs +++ b/swiftide-integrations/src/scraping/loader.rs @@ -1,8 +1,5 @@ -use std::sync::Arc; - use derive_builder::Builder; use spider::website::Website; -use tokio::{runtime::Handle, sync::RwLock}; use swiftide_core::{ indexing::{IndexingStream, Node}, @@ -16,7 +13,7 @@ use swiftide_core::{ /// Under the hood uses the `spider` crate to scrape the website. /// For more configuration options see their documentation. pub struct ScrapingLoader { - spider_website: Arc>, + spider_website: Website, } impl ScrapingLoader { @@ -27,9 +24,7 @@ impl ScrapingLoader { // Constructs a scrapingloader from a `spider::Website` configuration #[allow(dead_code)] pub fn from_spider(spider_website: Website) -> Self { - Self { - spider_website: Arc::new(RwLock::new(spider_website)), - } + Self { spider_website } } /// Constructs a scrapingloader from a given url @@ -39,17 +34,13 @@ impl ScrapingLoader { } impl Loader for ScrapingLoader { - fn into_stream(self) -> IndexingStream { - let (tx, rx) = std::sync::mpsc::channel(); - let mut spider_rx = tokio::task::block_in_place(|| { - Handle::current().block_on(async { - self.spider_website - .write() - .await - .subscribe(0) - .expect("Failed to subscribe to spider") - }) - }); + fn into_stream(mut self) -> IndexingStream { + let (tx, rx) = tokio::sync::mpsc::channel(1000); + let mut spider_rx = self + .spider_website + .subscribe(0) + .expect("Failed to subscribe to spider"); + tracing::info!("Subscribed to spider"); let _recv_thread = tokio::spawn(async move { while let Ok(res) = spider_rx.recv().await { @@ -62,23 +53,112 @@ impl Loader for ScrapingLoader { .path(res.get_url()) .build(); - if tx.send(node).is_err() { + tracing::debug!(?node, "[Spider] Received node from spider"); + + if let Err(error) = tx.send(node).await { + tracing::error!(?error, "[Spider] Failed to send node to stream"); break; } } }); + let mut spider_website = self.spider_website; + let _scrape_thread = tokio::spawn(async move { - let mut spider_website = self.spider_website.write().await; - spider_website.scrape().await; + tracing::info!("[Spider] Starting scrape loop"); + // TODO: It would be much nicer if this used `scrape` instead, as it is supposedly + // more concurrent + spider_website.crawl().await; + tracing::info!("[Spider] Scrape loop finished"); }); // NOTE: Handles should stay alive because of rx, but feels a bit fishy - - IndexingStream::iter(rx) + rx.into() } fn into_stream_boxed(self: Box) -> IndexingStream { self.into_stream() } } + +#[cfg(test)] +mod tests { + use super::*; + use anyhow::Result; + use futures_util::StreamExt; + use swiftide_core::indexing::Loader; + use wiremock::matchers::{method, path}; + use wiremock::{Mock, MockServer, Request, ResponseTemplate}; + + #[test_log::test(tokio::test(flavor = "multi_thread"))] + async fn test_scraping_loader_with_wiremock() { + // Set up the wiremock server to simulate the remote web server + let mock_server = MockServer::start().await; + + // Mocked response for the page we will scrape + let body = "

Test Page

"; + Mock::given(method("GET")) + .and(path("/")) + .respond_with(ResponseTemplate::new(200).set_body_string(body)) + .mount(&mock_server) + .await; + + // Create an instance of ScrapingLoader using the mock server's URL + let loader = ScrapingLoader::from_url(mock_server.uri()); + + // Execute the into_stream method + let stream = loader.into_stream(); + + // Process the stream to check if we get the expected result + let nodes = stream.collect::>>().await; + + assert_eq!(nodes.len(), 1); + + let first_node = nodes.first().unwrap().as_ref().unwrap(); + + assert_eq!(first_node.chunk, body); + } + + #[test_log::test(tokio::test(flavor = "multi_thread"))] + async fn test_scraping_loader_multiple_pages() { + // Set up the wiremock server to simulate the remote web server + let mock_server = MockServer::start().await; + + // Mocked response for the page we will scrape + let body = "

Test Page

link"; + Mock::given(method("GET")) + .and(path("/")) + .respond_with(ResponseTemplate::new(200).set_body_string(body)) + .mount(&mock_server) + .await; + + let body2 = "

Test Page 2

"; + Mock::given(method("GET")) + .and(path("/other")) + .respond_with(move |_req: &Request| { + std::thread::sleep(std::time::Duration::from_secs(1)); + ResponseTemplate::new(200).set_body_string(body2) + }) + .mount(&mock_server) + .await; + + // Create an instance of ScrapingLoader using the mock server's URL + let loader = ScrapingLoader::from_url(mock_server.uri()); + + // Execute the into_stream method + let stream = loader.into_stream(); + + // Process the stream to check if we get the expected result + let mut nodes = stream.collect::>>().await; + + assert_eq!(nodes.len(), 2); + + let first_node = nodes.pop().unwrap().unwrap(); + + assert_eq!(first_node.chunk, body2); + + let second_node = nodes.pop().unwrap().unwrap(); + + assert_eq!(second_node.chunk, body); + } +}