Skip to main content

cadmus_core/document/html/
xml.rs

1//! HTML and XML parsers that produce an [`XmlTree`].
2//!
3//! Two parsers are provided, each suited to a different use-case:
4//!
5//! - [`XmlParser`] — a hand-rolled recursive-descent parser. Node offsets are
6//!   exact byte positions of each token in the source string. Use this wherever
7//!   reading positions need to be persisted to disk (EPUB spine chapters,
8//!   standalone HTML files).
9//!
10//! - [`parse_html5`] — a thin wrapper around `html5ever`. Handles entities,
11//!   void elements, and the full HTML5 error-recovery algorithm. Node offsets
12//!   are **synthetic** (a monotonically increasing counter, not source
13//!   positions). Use this for ephemeral rendering where offset precision is
14//!   not required (e.g. the dictionary view).
15
16use super::dom::{element, text, whitespace, Attributes, NodeId, XmlTree};
17use fxhash::FxHashMap;
18use html5ever::tendril::{Tendril, TendrilSink};
19use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
20use html5ever::{Attribute, QualName};
21use std::cell::{Ref, RefCell};
22
/// Adds an XML-flavoured whitespace test to [`char`].
pub trait XmlExt {
    /// Reports whether the character is one of the four whitespace characters
    /// recognised by XML: space, horizontal tab, line feed, or carriage
    /// return.
    fn is_xml_whitespace(&self) -> bool;
}

impl XmlExt for char {
    fn is_xml_whitespace(&self) -> bool {
        // Form feed and the Unicode spaces are deliberately not in this set.
        [' ', '\t', '\n', '\r'].contains(self)
    }
}
35
/// Lenient, hand-written recursive-descent parser for XML and simple HTML.
///
/// The resulting [`XmlTree`] records, for every node, the exact byte position
/// in `input` at which the node starts: the opening `<` for elements and the
/// first character for text nodes. Reading positions that are persisted
/// across sessions rely on these offsets being byte-accurate.
///
/// Processing instructions, comments, CDATA sections and DOCTYPE
/// declarations are skipped without producing nodes. Self-closing tags
/// (`<br/>`, `<img/>`) are supported.
#[derive(Debug)]
pub struct XmlParser<'a> {
    /// Complete source text handed to the parser.
    pub input: &'a str,
    /// Cursor position, expressed as a byte offset into `input`.
    pub offset: usize,
}
53
54impl<'a> XmlParser<'a> {
55    /// Creates a new parser positioned at the start of `input`.
56    pub fn new(input: &str) -> XmlParser<'_> {
57        XmlParser { input, offset: 0 }
58    }
59
60    /// Returns `true` when the cursor has reached the end of the input.
61    fn eof(&self) -> bool {
62        self.offset >= self.input.len()
63    }
64
65    /// Returns the next character without advancing the cursor.
66    fn next(&self) -> Option<char> {
67        self.input[self.offset..].chars().next()
68    }
69
70    /// Returns `true` if the remaining input starts with `s`.
71    fn starts_with(&self, s: &str) -> bool {
72        self.input[self.offset..].starts_with(s)
73    }
74
75    /// Advances the cursor by exactly `n` Unicode scalar values.
76    fn advance(&mut self, n: usize) {
77        for c in self.input[self.offset..].chars().take(n) {
78            self.offset += c.len_utf8();
79        }
80    }
81
82    /// Advances the cursor as long as `test` returns `true` for the next char.
83    fn advance_while<F>(&mut self, test: F)
84    where
85        F: FnMut(&char) -> bool,
86    {
87        for c in self.input[self.offset..].chars().take_while(test) {
88            self.offset += c.len_utf8();
89        }
90    }
91
92    /// Advances the cursor until `target` is found and consumes it.
93    /// Does nothing if `target` is never found before EOF.
94    fn advance_until(&mut self, target: &str) {
95        while !self.eof() && !self.starts_with(target) {
96            self.advance(1);
97        }
98        self.advance(target.chars().count());
99    }
100
101    /// Parses the attribute list of an open tag, stopping at `>` or `/`.
102    ///
103    /// Both single- and double-quoted attribute values are supported. The
104    /// cursor is left immediately before the closing `>` or `/`.
105    fn parse_attributes(&mut self) -> Attributes {
106        let mut attrs = FxHashMap::default();
107        while !self.eof() {
108            self.advance_while(|&c| c.is_xml_whitespace());
109            match self.next() {
110                Some('>') | Some('/') | None => break,
111                _ => {
112                    let offset = self.offset;
113                    self.advance_while(|&c| c != '=');
114                    let key = self.input[offset..self.offset].to_string();
115                    self.advance_while(|&c| c != '"' && c != '\'');
116                    let quote = self.next().unwrap_or('"');
117                    self.advance(1);
118                    let offset = self.offset;
119                    self.advance_while(|&c| c != quote);
120                    let value = self.input[offset..self.offset].to_string();
121                    attrs.insert(key, value);
122                    self.advance(1);
123                }
124            }
125        }
126        attrs
127    }
128
129    /// Parses a single element (tag name + attributes + children) and appends
130    /// it to `parent_id` in `tree`.
131    ///
132    /// The cursor must be positioned immediately after the opening `<` when
133    /// this function is called. After returning the cursor is positioned after
134    /// the element's closing tag.
135    fn parse_element(&mut self, tree: &mut XmlTree, parent_id: NodeId) {
136        let offset = self.offset;
137        self.advance_while(|&c| c != '>' && c != '/' && !c.is_xml_whitespace());
138        let name = &self.input[offset..self.offset];
139        let attributes = self.parse_attributes();
140
141        match self.next() {
142            Some('/') => {
143                self.advance(2);
144                tree.get_mut(parent_id)
145                    .append(element(name, offset - 1, attributes));
146            }
147            Some('>') => {
148                self.advance(1);
149                let id = tree
150                    .get_mut(parent_id)
151                    .append(element(name, offset - 1, attributes));
152                self.parse_nodes(tree, id);
153            }
154            _ => (),
155        }
156    }
157
158    /// Parses all child nodes of `parent_id` until a matching closing tag or
159    /// EOF is reached.
160    ///
161    /// Handles text nodes, whitespace, elements, processing instructions
162    /// (`<?…?>`), comments (`<!--…-->`), CDATA sections (`<![…]]>`), and
163    /// DOCTYPE declarations.
164    fn parse_nodes(&mut self, tree: &mut XmlTree, parent_id: NodeId) {
165        while !self.eof() {
166            let offset = self.offset;
167            self.advance_while(|&c| c.is_xml_whitespace());
168
169            match self.next() {
170                Some('<') => {
171                    if self.offset > offset {
172                        tree.get_mut(parent_id)
173                            .append(whitespace(&self.input[offset..self.offset], offset));
174                    }
175                    if self.starts_with("</") {
176                        self.advance(2);
177                        self.advance_while(|&c| c != '>');
178                        self.advance(1);
179                        break;
180                    }
181                    self.advance(1);
182                    match self.next() {
183                        Some('?') => {
184                            self.advance(1);
185                            self.advance_until("?>");
186                        }
187                        Some('!') => {
188                            self.advance(1);
189                            match self.next() {
190                                Some('-') => {
191                                    self.advance(2);
192                                    self.advance_until("-->");
193                                }
194                                Some('[') => {
195                                    self.advance(1);
196                                    self.advance_until("]]>");
197                                }
198                                _ => {
199                                    self.advance_while(|&c| c != '>');
200                                    self.advance(1);
201                                }
202                            }
203                        }
204                        _ => self.parse_element(tree, parent_id),
205                    }
206                }
207                Some(..) => {
208                    self.advance_while(|&c| c != '<');
209                    tree.get_mut(parent_id)
210                        .append(text(&self.input[offset..self.offset], offset));
211                }
212                None => break,
213            }
214        }
215    }
216
217    /// Parses `self.input` and returns the resulting [`XmlTree`].
218    ///
219    /// Every node's `offset` is the byte position of its opening `<` or first
220    /// text character within the original source string.
221    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(len = self.input.len())))]
222    pub fn parse(&mut self) -> XmlTree {
223        let mut tree = XmlTree::new();
224        self.parse_nodes(&mut tree, NodeId::from_index(0));
225        tree
226    }
227}
228
229/// [`TreeSink`] implementation that bridges html5ever's push-based API into
230/// an [`XmlTree`].
231///
232/// Node offsets are assigned from a monotonically increasing counter rather
233/// than from source byte positions, because html5ever's `TreeSink` callbacks
234/// do not receive source positions. The counter advances by 1 per element and
235/// by `text.len()` per text chunk, preserving the non-overlap invariant needed
236/// by the layout engine's page-finding binary search.
237struct Html5Sink {
238    /// The tree being built. `RefCell` is required because multiple `TreeSink`
239    /// methods need mutable access and Rust's borrow checker cannot see that
240    /// html5ever calls them non-concurrently.
241    tree: RefCell<XmlTree>,
242    /// Maps each element `NodeId` to its fully-qualified name so that
243    /// `elem_name` can return a borrowed reference as required by the trait.
244    qual_names: RefCell<FxHashMap<NodeId, QualName>>,
245    /// Maps `<template>` element `NodeId`s to their associated content root
246    /// `NodeId`, as required by the HTML5 template element spec.
247    template_contents: RefCell<FxHashMap<NodeId, NodeId>>,
248    /// Synthetic position counter. Incremented for every node created so that
249    /// all offsets are unique and ordered by document position.
250    offset_counter: RefCell<usize>,
251}
252
253impl Html5Sink {
254    /// Creates a new sink with an empty tree and a zeroed offset counter.
255    fn new() -> Self {
256        Html5Sink {
257            tree: RefCell::new(XmlTree::new()),
258            qual_names: RefCell::new(FxHashMap::default()),
259            template_contents: RefCell::new(FxHashMap::default()),
260            offset_counter: RefCell::new(0),
261        }
262    }
263
264    /// Returns the current value of the offset counter without advancing it.
265    fn next_offset(&self) -> usize {
266        *self.offset_counter.borrow()
267    }
268
269    /// Advances the offset counter by `by`, clamped to a minimum of 1 to
270    /// guarantee that every node receives a strictly larger offset than the
271    /// previous one even for zero-length text runs.
272    fn advance_offset(&self, by: usize) {
273        *self.offset_counter.borrow_mut() += by.max(1);
274    }
275
276    /// Returns `true` when `text` contains only ASCII whitespace characters.
277    fn is_whitespace_only(text: &str) -> bool {
278        text.chars().all(|c| c.is_ascii_whitespace())
279    }
280
281    /// Converts an html5ever [`Attribute`] name to its string representation,
282    /// prefixing with the namespace if one is present (e.g. `xml:lang`).
283    fn attr_name(attr: &Attribute) -> String {
284        match &attr.name.prefix {
285            Some(prefix) => format!("{}:{}", prefix.as_ref(), attr.name.local.as_ref()),
286            None => attr.name.local.as_ref().to_string(),
287        }
288    }
289
290    /// Converts a `Vec<Attribute>` from html5ever into the [`Attributes`] map
291    /// used by the DOM.
292    fn build_attributes(attrs: Vec<Attribute>) -> Attributes {
293        let mut attributes = Attributes::default();
294        for attr in attrs {
295            attributes.insert(Self::attr_name(&attr), attr.value.to_string());
296        }
297        attributes
298    }
299}
300
301impl TreeSink for Html5Sink {
302    type Handle = NodeId;
303    type Output = XmlTree;
304    type ElemName<'a> = Ref<'a, QualName>;
305
306    fn finish(self) -> Self::Output {
307        self.tree.into_inner()
308    }
309
310    /// Silently ignores all parse errors. The dictionary content from
311    /// reader-dict is often malformed HTML, and we rely on html5ever's
312    /// error-recovery rather than failing on bad input.
313    fn parse_error(&self, _msg: std::borrow::Cow<'static, str>) {}
314
315    fn get_document(&self) -> Self::Handle {
316        NodeId::from_index(0)
317    }
318
319    fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> Self::ElemName<'a> {
320        Ref::map(self.qual_names.borrow(), |names| {
321            names.get(target).expect("elem_name called on unknown node")
322        })
323    }
324
325    /// Creates a new element node, assigns it the next synthetic offset, and
326    /// registers its qualified name for later `elem_name` lookups.
327    ///
328    /// For `<template>` elements an additional content-root node is created
329    /// and stored in `template_contents`, as required by the spec.
330    fn create_element(
331        &self,
332        name: QualName,
333        attrs: Vec<Attribute>,
334        flags: ElementFlags,
335    ) -> Self::Handle {
336        let tag_name = name.local.as_ref();
337        let offset = self.next_offset();
338        self.advance_offset(1);
339        let attributes = Self::build_attributes(attrs);
340        let data = element(tag_name, offset, attributes);
341        let id = self.tree.borrow_mut().push_node(data);
342        self.qual_names.borrow_mut().insert(id, name.clone());
343
344        if flags.template {
345            let template_root_offset = self.next_offset();
346            self.advance_offset(1);
347            let template_root = element(
348                "template-contents",
349                template_root_offset,
350                Attributes::default(),
351            );
352            let template_id = self.tree.borrow_mut().push_node(template_root);
353            self.template_contents.borrow_mut().insert(id, template_id);
354        }
355
356        id
357    }
358
359    /// Maps an HTML comment to an empty whitespace node so it occupies a slot
360    /// in the offset space without contributing visible content.
361    fn create_comment(&self, _text: Tendril<html5ever::tendril::fmt::UTF8>) -> Self::Handle {
362        let offset = self.next_offset();
363        self.advance_offset(1);
364        let data = whitespace("", offset);
365        self.tree.borrow_mut().push_node(data)
366    }
367
368    /// Maps a processing instruction to an empty whitespace node so it
369    /// occupies a slot in the offset space without contributing visible
370    /// content.
371    fn create_pi(
372        &self,
373        _target: Tendril<html5ever::tendril::fmt::UTF8>,
374        _data: Tendril<html5ever::tendril::fmt::UTF8>,
375    ) -> Self::Handle {
376        let offset = self.next_offset();
377        self.advance_offset(1);
378        let data = whitespace("", offset);
379        self.tree.borrow_mut().push_node(data)
380    }
381
382    /// Appends a child node or text run to `parent`.
383    ///
384    /// Text runs are coalesced into the preceding sibling text node when one
385    /// exists, to match the behaviour of the hand-rolled parser and avoid
386    /// producing redundant nodes for adjacent text chunks.
387    fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
388        match child {
389            NodeOrText::AppendNode(node) => {
390                self.tree.borrow_mut().attach_child(*parent, node);
391            }
392            NodeOrText::AppendText(t) => {
393                let text_str = t.as_ref();
394                let last_child_id = self
395                    .tree
396                    .borrow()
397                    .get(*parent)
398                    .last_child()
399                    .filter(|n| n.tag_name().is_none() && !n.text().is_empty())
400                    .map(|n| n.id);
401
402                if let Some(last_id) = last_child_id {
403                    self.advance_offset(text_str.len());
404                    self.tree.borrow_mut().append_text_to(last_id, text_str);
405                } else {
406                    let offset = self.next_offset();
407                    self.advance_offset(text_str.len());
408                    let data = if Self::is_whitespace_only(text_str) {
409                        whitespace(text_str, offset)
410                    } else {
411                        text(text_str, offset)
412                    };
413                    let node_id = self.tree.borrow_mut().push_node(data);
414                    self.tree.borrow_mut().attach_child(*parent, node_id);
415                }
416            }
417        }
418    }
419
420    /// Delegates to [`Self::append`] using `element` as the target parent.
421    ///
422    /// Called by html5ever during foster-parenting and similar error-recovery
423    /// situations where the intended parent is determined by the element rather
424    /// than its previous sibling.
425    fn append_based_on_parent_node(
426        &self,
427        element: &Self::Handle,
428        prev_element: &Self::Handle,
429        child: NodeOrText<Self::Handle>,
430    ) {
431        let has_parent = self.tree.borrow().get(*element).parent().is_some();
432        if has_parent {
433            self.append_before_sibling(element, child);
434        } else {
435            self.append(prev_element, child);
436        }
437    }
438
439    /// Inserts a node or text run immediately before `sibling`.
440    fn append_before_sibling(&self, sibling: &Self::Handle, new_node: NodeOrText<Self::Handle>) {
441        match new_node {
442            NodeOrText::AppendNode(node) => {
443                self.tree.borrow_mut().insert_before(*sibling, node);
444            }
445            NodeOrText::AppendText(t) => {
446                let text_str = t.as_ref();
447                let offset = self.next_offset();
448                self.advance_offset(text_str.len());
449                let data = if Self::is_whitespace_only(text_str) {
450                    whitespace(text_str, offset)
451                } else {
452                    text(text_str, offset)
453                };
454                let node_id = self.tree.borrow_mut().push_node(data);
455                self.tree.borrow_mut().insert_before(*sibling, node_id);
456            }
457        }
458    }
459
460    /// DOCTYPE declarations are not represented in the tree.
461    fn append_doctype_to_document(
462        &self,
463        _name: Tendril<html5ever::tendril::fmt::UTF8>,
464        _public_id: Tendril<html5ever::tendril::fmt::UTF8>,
465        _system_id: Tendril<html5ever::tendril::fmt::UTF8>,
466    ) {
467    }
468
469    fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
470        *self
471            .template_contents
472            .borrow()
473            .get(target)
474            .expect("template contents not registered")
475    }
476
477    fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
478        x == y
479    }
480
481    /// Quirks mode is accepted but has no effect on the tree representation.
482    fn set_quirks_mode(&self, _mode: QuirksMode) {}
483
484    fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<Attribute>) {
485        let mut tree = self.tree.borrow_mut();
486        for attr in attrs {
487            tree.add_attr_if_missing(*target, &Self::attr_name(&attr), &attr.value);
488        }
489    }
490
491    fn remove_from_parent(&self, target: &Self::Handle) {
492        self.tree.borrow_mut().detach(*target);
493    }
494
495    fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle) {
496        let children: Vec<NodeId> = self
497            .tree
498            .borrow()
499            .get(*node)
500            .children()
501            .map(|c| c.id)
502            .collect();
503        for child in children {
504            self.tree.borrow_mut().detach(child);
505            self.tree.borrow_mut().attach_child(*new_parent, child);
506        }
507    }
508}
509
510/// Parses `input` as HTML using the html5ever spec-compliant parser and
511/// returns the resulting [`XmlTree`].
512///
513/// Compared to [`XmlParser`] this handles the full range of HTML5 content
514/// correctly:
515///
516/// - Named and numeric entities (`&amp;`, `&#160;`, …) are decoded.
517/// - Void elements (`<br>`, `<img>`, `<input>`, …) are never given children.
518/// - Implicitly-closed block tags (`<p>`, `<li>`, …) are auto-closed per spec.
519/// - Unclosed tags at EOF are closed automatically.
520///
521/// **Offset semantics:** node offsets are synthetic (a monotonically
522/// increasing counter) and are **not** byte positions in the source string.
523/// This makes the tree unsuitable for persisting reading positions to disk.
524/// Use [`XmlParser`] when byte-accurate offsets are required.
525#[cfg_attr(feature = "tracing", tracing::instrument(skip(input), fields(len = input.len())))]
526pub fn parse_html5(input: &str) -> XmlTree {
527    use html5ever::{parse_document, ParseOpts};
528
529    let parser = parse_document(Html5Sink::new(), ParseOpts::default());
530    let input_tendril: Tendril<html5ever::tendril::fmt::UTF8> = input.into();
531    let mut tree = parser.one(input_tendril);
532    tree.wrap_lost_inlines();
533    tree
534}
535
#[cfg(test)]
mod tests {
    use super::*;

    // ---- XmlParser: byte-accurate offsets -------------------------------

    /// A self-closing element is reported at the offset of its `<`.
    #[test]
    fn test_simple_element() {
        let source = "<a/>";
        let tree = XmlParser::new(source).parse();
        let first = tree.root().first_child().unwrap();
        assert_eq!(first.offset(), 0);
        assert_eq!(first.tag_name(), Some("a"));
    }

    /// Double- and single-quoted values are both read; a double quote inside
    /// a single-quoted value is kept verbatim.
    #[test]
    fn test_attributes() {
        let source = r#"<a b="c" d='e"'/>"#;
        let tree = XmlParser::new(source).parse();
        let first = tree.root().first_child().unwrap();
        assert_eq!(first.attribute("b"), Some("c"));
        assert_eq!(first.attribute("d"), Some("e\""));
    }

    /// A text node starts at the byte offset of its first character.
    #[test]
    fn test_text() {
        let source = "<a>bcd</a>";
        let tree = XmlParser::new(source).parse();
        let inner = tree.root().first_child().unwrap().children().next();
        assert_eq!(inner.map(|c| c.offset()), Some(3));
        assert_eq!(inner.map(|c| c.text()), Some("bcd".to_string()));
    }

    /// Whitespace between two sibling elements is kept as its own node.
    #[test]
    fn test_inbetween_space() {
        let source = "<a><b>x</b> <c>y</c></a>";
        let tree = XmlParser::new(source).parse();
        let second = tree.root().first_child().unwrap().children().nth(1);
        assert_eq!(second.map(|c| c.text()), Some(" ".to_string()));
    }

    /// Whitespace that is an element's only content is preserved.
    #[test]
    fn test_central_space() {
        let source = "<a><b> </b></a>";
        let tree = XmlParser::new(source).parse();
        assert_eq!(tree.root().text(), " ");
    }

    // ---- parse_html5: html5ever-backed parsing --------------------------

    /// An unclosed void element still ends up in the tree.
    #[test]
    fn html5_void_element() {
        let source = "<br>";
        let tree = parse_html5(source);
        assert!(tree.root().find("br").is_some());
    }

    /// Named entities are decoded in text content.
    #[test]
    fn html5_entity_decoding() {
        let source = "<p>hello&amp;world</p>";
        let tree = parse_html5(source);
        let paragraph = tree.root().find("p").unwrap();
        assert_eq!(paragraph.text(), "hello&world");
    }

    /// A second `<p>` implicitly closes the first, yielding two paragraphs.
    #[test]
    fn html5_unclosed_p_tags() {
        let source = "<p>first<p>second";
        let tree = parse_html5(source);
        let paragraphs = tree
            .root()
            .descendants()
            .filter(|n| n.tag_name() == Some("p"))
            .count();
        assert_eq!(paragraphs, 2);
    }

    /// An `<ol>` placed directly inside another `<ol>` keeps its attributes.
    #[test]
    fn html5_nested_ol_in_ol() {
        let source =
            r#"<ol><li>top</li><ol style="list-style-type:lower-alpha"><li>sub</li></ol></ol>"#;
        let tree = parse_html5(source);
        let styled_list = tree
            .root()
            .descendants()
            .find(|n| n.tag_name() == Some("ol") && n.attribute("style").is_some());
        assert!(
            styled_list.is_some(),
            "inner <ol> with style should exist in the tree"
        );
        assert_eq!(
            styled_list.unwrap().attribute("style"),
            Some("list-style-type:lower-alpha")
        );
    }

    /// Text after a comment must become a separate node rather than being
    /// merged into the comment's empty placeholder.
    #[test]
    fn html5_comment_does_not_coalesce_following_text() {
        let source = "<p>Hello<!-- comment -->World</p>";
        let tree = parse_html5(source);

        let paragraph = tree.root().find("p").expect("p should exist");
        let kids: Vec<_> = paragraph.children().collect();

        assert_eq!(
            kids.len(),
            3,
            "p should have 3 children: text, comment placeholder, text"
        );

        let texts: Vec<_> = kids
            .iter()
            .map(|n| n.text())
            .filter(|t| !t.is_empty())
            .collect();

        assert!(
            texts.contains(&"Hello".to_string()),
            "text 'Hello' should exist as separate node"
        );
        assert!(
            texts.contains(&"World".to_string()),
            "text 'World' should exist as separate node, not coalesced into comment node"
        );

        let placeholder = kids
            .iter()
            .find(|n| n.text().is_empty() && n.tag_name().is_none());
        assert!(
            placeholder.is_some(),
            "empty whitespace node (comment placeholder) should exist"
        );
    }

    /// Same as the comment case, for `<?…?>` markup between two text runs.
    #[test]
    fn html5_pi_does_not_coalesce_following_text() {
        let source = "<p>Hello<?target data?>World</p>";
        let tree = parse_html5(source);

        let paragraph = tree.root().find("p").expect("p should exist");
        let kids: Vec<_> = paragraph.children().collect();

        assert_eq!(
            kids.len(),
            3,
            "p should have 3 children: text, pi placeholder, text"
        );

        let texts: Vec<_> = kids
            .iter()
            .map(|n| n.text())
            .filter(|t| !t.is_empty())
            .collect();

        assert!(
            texts.contains(&"Hello".to_string()),
            "text 'Hello' should exist as separate node"
        );
        assert!(
            texts.contains(&"World".to_string()),
            "text 'World' should exist as separate node, not coalesced into pi node"
        );
    }

    /// Synthetic offsets must leave room for each text node's full length so
    /// that consecutive text nodes never overlap.
    #[test]
    fn html5_text_node_offsets_do_not_overlap() {
        let source = "<p><em>Cadmus</em> is a document reader for <em>Kobo</em>'s e-readers.</p>";
        let tree = parse_html5(source);

        let mut spans: Vec<(usize, usize)> = tree
            .root()
            .descendants()
            .filter(|n| n.tag_name().is_none())
            .map(|n| (n.offset(), n.text().len()))
            .filter(|(_, len)| *len > 0)
            .collect();

        spans.sort_by_key(|(offset, _)| *offset);

        for pair in spans.windows(2) {
            let (offset_a, len_a) = pair[0];
            let (offset_b, _) = pair[1];
            assert!(
                offset_a + len_a <= offset_b,
                "text node at offset {} with len {} overlaps next node at offset {}",
                offset_a,
                len_a,
                offset_b
            );
        }
    }
}