diff --git a/Cargo.lock b/Cargo.lock index b854ef2b..50df5256 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -390,7 +390,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scraper" -version = "0.23.0" +version = "0.23.1" dependencies = [ "cssparser", "ego-tree", diff --git a/README.md b/README.md deleted file mode 100644 index 39450ec2..00000000 --- a/README.md +++ /dev/null @@ -1,152 +0,0 @@ -# scraper - -[![crates.io](https://img.shields.io/crates/v/scraper?color=dark-green)][crate] -[![downloads](https://img.shields.io/crates/d/scraper)][crate] -[![test](https://github.com/causal-agent/scraper/actions/workflows/test.yml/badge.svg)][tests] - -HTML parsing and querying with CSS selectors. - -`scraper` is on [Crates.io][crate] and [GitHub][github]. - -[crate]: https://crates.io/crates/scraper -[github]: https://github.com/causal-agent/scraper -[tests]: https://github.com/causal-agent/scraper/actions/workflows/test.yml - -Scraper provides an interface to Servo's `html5ever` and `selectors` crates, for browser-grade parsing and querying. - -## Examples - -### Parsing a document - -```rust -use scraper::Html; - -let html = r#" - - - Hello, world! -

Hello, world!

-"#; - -let document = Html::parse_document(html); -``` - -### Parsing a fragment - -```rust -use scraper::Html; -let fragment = Html::parse_fragment("

Hello, world!

"); -``` - -### Parsing a selector - -```rust -use scraper::Selector; -let selector = Selector::parse("h1.foo").unwrap(); -``` - -### Selecting elements - -```rust -use scraper::{Html, Selector}; - -let html = r#" - -"#; - -let fragment = Html::parse_fragment(html); -let selector = Selector::parse("li").unwrap(); - -for element in fragment.select(&selector) { - assert_eq!("li", element.value().name()); -} -``` - -### Selecting descendent elements - -```rust -use scraper::{Html, Selector}; - -let html = r#" - -"#; - -let fragment = Html::parse_fragment(html); -let ul_selector = Selector::parse("ul").unwrap(); -let li_selector = Selector::parse("li").unwrap(); - -let ul = fragment.select(&ul_selector).next().unwrap(); -for element in ul.select(&li_selector) { - assert_eq!("li", element.value().name()); -} -``` - -### Accessing element attributes - -```rust -use scraper::{Html, Selector}; - -let fragment = Html::parse_fragment(r#""#); -let selector = Selector::parse(r#"input[name="foo"]"#).unwrap(); - -let input = fragment.select(&selector).next().unwrap(); -assert_eq!(Some("bar"), input.value().attr("value")); -``` - -### Serializing HTML and inner HTML - -```rust -use scraper::{Html, Selector}; - -let fragment = Html::parse_fragment("

Hello, world!

"); -let selector = Selector::parse("h1").unwrap(); - -let h1 = fragment.select(&selector).next().unwrap(); - -assert_eq!("

Hello, world!

", h1.html()); -assert_eq!("Hello, world!", h1.inner_html()); -``` - -### Accessing descendent text - -```rust -use scraper::{Html, Selector}; - -let fragment = Html::parse_fragment("

<h1>Hello, <i>world!</i></h1>

"); -let selector = Selector::parse("h1").unwrap(); - -let h1 = fragment.select(&selector).next().unwrap(); -let text = h1.text().collect::<Vec<_>>(); - -assert_eq!(vec!["Hello, ", "world!"], text); -``` - -### Manipulating the DOM - -```rust -use html5ever::tree_builder::TreeSink; -use scraper::{Html, Selector}; - -let html = "hello

REMOVE ME

"; -let selector = Selector::parse(".hello").unwrap(); -let mut document = Html::parse_document(html); -let node_ids: Vec<_> = document.select(&selector).map(|x| x.id()).collect(); -for id in node_ids { - document.remove_from_parent(&id); -} -assert_eq!(document.html(), "hello"); -``` - -## Contributing - -Please feel free to open pull requests. If you're planning on implementing -something big (i.e. not fixing a typo, a small bug fix, minor refactor, etc) -then please open an issue first. diff --git a/README.md b/README.md new file mode 120000 index 00000000..a6541ddb --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +scraper/README.md \ No newline at end of file diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index 6c53b45d..b88d6f49 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scraper" -version = "0.23.0" +version = "0.23.1" edition = "2021" description = "HTML parsing and querying with CSS selectors" diff --git a/scraper/README.md b/scraper/README.md index 32d46ee8..39450ec2 120000 --- a/scraper/README.md +++ b/scraper/README.md @@ -1 +1,152 @@ -../README.md \ No newline at end of file +# scraper + +[![crates.io](https://img.shields.io/crates/v/scraper?color=dark-green)][crate] +[![downloads](https://img.shields.io/crates/d/scraper)][crate] +[![test](https://github.com/causal-agent/scraper/actions/workflows/test.yml/badge.svg)][tests] + +HTML parsing and querying with CSS selectors. + +`scraper` is on [Crates.io][crate] and [GitHub][github]. 
+ +[crate]: https://crates.io/crates/scraper +[github]: https://github.com/causal-agent/scraper +[tests]: https://github.com/causal-agent/scraper/actions/workflows/test.yml + +Scraper provides an interface to Servo's `html5ever` and `selectors` crates, for browser-grade parsing and querying. + +## Examples + +### Parsing a document + +```rust +use scraper::Html; + +let html = r#" + + + Hello, world! +

Hello, world!

+"#; + +let document = Html::parse_document(html); +``` + +### Parsing a fragment + +```rust +use scraper::Html; +let fragment = Html::parse_fragment("

Hello, world!

"); +``` + +### Parsing a selector + +```rust +use scraper::Selector; +let selector = Selector::parse("h1.foo").unwrap(); +``` + +### Selecting elements + +```rust +use scraper::{Html, Selector}; + +let html = r#" + +"#; + +let fragment = Html::parse_fragment(html); +let selector = Selector::parse("li").unwrap(); + +for element in fragment.select(&selector) { + assert_eq!("li", element.value().name()); +} +``` + +### Selecting descendent elements + +```rust +use scraper::{Html, Selector}; + +let html = r#" + +"#; + +let fragment = Html::parse_fragment(html); +let ul_selector = Selector::parse("ul").unwrap(); +let li_selector = Selector::parse("li").unwrap(); + +let ul = fragment.select(&ul_selector).next().unwrap(); +for element in ul.select(&li_selector) { + assert_eq!("li", element.value().name()); +} +``` + +### Accessing element attributes + +```rust +use scraper::{Html, Selector}; + +let fragment = Html::parse_fragment(r#""#); +let selector = Selector::parse(r#"input[name="foo"]"#).unwrap(); + +let input = fragment.select(&selector).next().unwrap(); +assert_eq!(Some("bar"), input.value().attr("value")); +``` + +### Serializing HTML and inner HTML + +```rust +use scraper::{Html, Selector}; + +let fragment = Html::parse_fragment("

Hello, world!

"); +let selector = Selector::parse("h1").unwrap(); + +let h1 = fragment.select(&selector).next().unwrap(); + +assert_eq!("

Hello, world!

", h1.html()); +assert_eq!("Hello, world!", h1.inner_html()); +``` + +### Accessing descendent text + +```rust +use scraper::{Html, Selector}; + +let fragment = Html::parse_fragment("

<h1>Hello, <i>world!</i></h1>

"); +let selector = Selector::parse("h1").unwrap(); + +let h1 = fragment.select(&selector).next().unwrap(); +let text = h1.text().collect::<Vec<_>>(); + +assert_eq!(vec!["Hello, ", "world!"], text); +``` + +### Manipulating the DOM + +```rust +use html5ever::tree_builder::TreeSink; +use scraper::{Html, Selector}; + +let html = "hello

REMOVE ME

"; +let selector = Selector::parse(".hello").unwrap(); +let mut document = Html::parse_document(html); +let node_ids: Vec<_> = document.select(&selector).map(|x| x.id()).collect(); +for id in node_ids { + document.remove_from_parent(&id); +} +assert_eq!(document.html(), "hello"); +``` + +## Contributing + +Please feel free to open pull requests. If you're planning on implementing +something big (i.e. not fixing a typo, a small bug fix, minor refactor, etc) +then please open an issue first.