cadmus_core/document/html/
xml.rs1use super::dom::{element, text, whitespace, Attributes, NodeId, XmlTree};
17use fxhash::FxHashMap;
18use html5ever::tendril::{Tendril, TendrilSink};
19use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
20use html5ever::{Attribute, QualName};
21use std::cell::{Ref, RefCell};
22
/// Extension trait for classifying characters the way the XML parser needs.
pub trait XmlExt {
    /// Returns `true` when the value is one of the four XML whitespace
    /// characters: space, tab, line feed or carriage return.
    fn is_xml_whitespace(&self) -> bool;
}

impl XmlExt for char {
    fn is_xml_whitespace(&self) -> bool {
        // Form feed and other Unicode spaces are deliberately not included.
        [' ', '\t', '\n', '\r'].contains(self)
    }
}
35
/// Cursor-based parser over a borrowed XML string.
///
/// The parser keeps the whole input and a byte offset into it; all parsing
/// methods move `offset` forward.
#[derive(Debug)]
pub struct XmlParser<'a> {
    /// The complete text being parsed.
    pub input: &'a str,
    /// Current cursor position, as a byte index into `input`.
    pub offset: usize,
}
53
54impl<'a> XmlParser<'a> {
55 pub fn new(input: &str) -> XmlParser<'_> {
57 XmlParser { input, offset: 0 }
58 }
59
60 fn eof(&self) -> bool {
62 self.offset >= self.input.len()
63 }
64
65 fn next(&self) -> Option<char> {
67 self.input[self.offset..].chars().next()
68 }
69
70 fn starts_with(&self, s: &str) -> bool {
72 self.input[self.offset..].starts_with(s)
73 }
74
75 fn advance(&mut self, n: usize) {
77 for c in self.input[self.offset..].chars().take(n) {
78 self.offset += c.len_utf8();
79 }
80 }
81
82 fn advance_while<F>(&mut self, test: F)
84 where
85 F: FnMut(&char) -> bool,
86 {
87 for c in self.input[self.offset..].chars().take_while(test) {
88 self.offset += c.len_utf8();
89 }
90 }
91
92 fn advance_until(&mut self, target: &str) {
95 while !self.eof() && !self.starts_with(target) {
96 self.advance(1);
97 }
98 self.advance(target.chars().count());
99 }
100
101 fn parse_attributes(&mut self) -> Attributes {
106 let mut attrs = FxHashMap::default();
107 while !self.eof() {
108 self.advance_while(|&c| c.is_xml_whitespace());
109 match self.next() {
110 Some('>') | Some('/') | None => break,
111 _ => {
112 let offset = self.offset;
113 self.advance_while(|&c| c != '=');
114 let key = self.input[offset..self.offset].to_string();
115 self.advance_while(|&c| c != '"' && c != '\'');
116 let quote = self.next().unwrap_or('"');
117 self.advance(1);
118 let offset = self.offset;
119 self.advance_while(|&c| c != quote);
120 let value = self.input[offset..self.offset].to_string();
121 attrs.insert(key, value);
122 self.advance(1);
123 }
124 }
125 }
126 attrs
127 }
128
129 fn parse_element(&mut self, tree: &mut XmlTree, parent_id: NodeId) {
136 let offset = self.offset;
137 self.advance_while(|&c| c != '>' && c != '/' && !c.is_xml_whitespace());
138 let name = &self.input[offset..self.offset];
139 let attributes = self.parse_attributes();
140
141 match self.next() {
142 Some('/') => {
143 self.advance(2);
144 tree.get_mut(parent_id)
145 .append(element(name, offset - 1, attributes));
146 }
147 Some('>') => {
148 self.advance(1);
149 let id = tree
150 .get_mut(parent_id)
151 .append(element(name, offset - 1, attributes));
152 self.parse_nodes(tree, id);
153 }
154 _ => (),
155 }
156 }
157
158 fn parse_nodes(&mut self, tree: &mut XmlTree, parent_id: NodeId) {
165 while !self.eof() {
166 let offset = self.offset;
167 self.advance_while(|&c| c.is_xml_whitespace());
168
169 match self.next() {
170 Some('<') => {
171 if self.offset > offset {
172 tree.get_mut(parent_id)
173 .append(whitespace(&self.input[offset..self.offset], offset));
174 }
175 if self.starts_with("</") {
176 self.advance(2);
177 self.advance_while(|&c| c != '>');
178 self.advance(1);
179 break;
180 }
181 self.advance(1);
182 match self.next() {
183 Some('?') => {
184 self.advance(1);
185 self.advance_until("?>");
186 }
187 Some('!') => {
188 self.advance(1);
189 match self.next() {
190 Some('-') => {
191 self.advance(2);
192 self.advance_until("-->");
193 }
194 Some('[') => {
195 self.advance(1);
196 self.advance_until("]]>");
197 }
198 _ => {
199 self.advance_while(|&c| c != '>');
200 self.advance(1);
201 }
202 }
203 }
204 _ => self.parse_element(tree, parent_id),
205 }
206 }
207 Some(..) => {
208 self.advance_while(|&c| c != '<');
209 tree.get_mut(parent_id)
210 .append(text(&self.input[offset..self.offset], offset));
211 }
212 None => break,
213 }
214 }
215 }
216
217 #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(len = self.input.len())))]
222 pub fn parse(&mut self) -> XmlTree {
223 let mut tree = XmlTree::new();
224 self.parse_nodes(&mut tree, NodeId::from_index(0));
225 tree
226 }
227}
228
229struct Html5Sink {
238 tree: RefCell<XmlTree>,
242 qual_names: RefCell<FxHashMap<NodeId, QualName>>,
245 template_contents: RefCell<FxHashMap<NodeId, NodeId>>,
248 offset_counter: RefCell<usize>,
251}
252
253impl Html5Sink {
254 fn new() -> Self {
256 Html5Sink {
257 tree: RefCell::new(XmlTree::new()),
258 qual_names: RefCell::new(FxHashMap::default()),
259 template_contents: RefCell::new(FxHashMap::default()),
260 offset_counter: RefCell::new(0),
261 }
262 }
263
264 fn next_offset(&self) -> usize {
266 *self.offset_counter.borrow()
267 }
268
269 fn advance_offset(&self, by: usize) {
273 *self.offset_counter.borrow_mut() += by.max(1);
274 }
275
276 fn is_whitespace_only(text: &str) -> bool {
278 text.chars().all(|c| c.is_ascii_whitespace())
279 }
280
281 fn attr_name(attr: &Attribute) -> String {
284 match &attr.name.prefix {
285 Some(prefix) => format!("{}:{}", prefix.as_ref(), attr.name.local.as_ref()),
286 None => attr.name.local.as_ref().to_string(),
287 }
288 }
289
290 fn build_attributes(attrs: Vec<Attribute>) -> Attributes {
293 let mut attributes = Attributes::default();
294 for attr in attrs {
295 attributes.insert(Self::attr_name(&attr), attr.value.to_string());
296 }
297 attributes
298 }
299}
300
301impl TreeSink for Html5Sink {
302 type Handle = NodeId;
303 type Output = XmlTree;
304 type ElemName<'a> = Ref<'a, QualName>;
305
306 fn finish(self) -> Self::Output {
307 self.tree.into_inner()
308 }
309
310 fn parse_error(&self, _msg: std::borrow::Cow<'static, str>) {}
314
315 fn get_document(&self) -> Self::Handle {
316 NodeId::from_index(0)
317 }
318
319 fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> Self::ElemName<'a> {
320 Ref::map(self.qual_names.borrow(), |names| {
321 names.get(target).expect("elem_name called on unknown node")
322 })
323 }
324
325 fn create_element(
331 &self,
332 name: QualName,
333 attrs: Vec<Attribute>,
334 flags: ElementFlags,
335 ) -> Self::Handle {
336 let tag_name = name.local.as_ref();
337 let offset = self.next_offset();
338 self.advance_offset(1);
339 let attributes = Self::build_attributes(attrs);
340 let data = element(tag_name, offset, attributes);
341 let id = self.tree.borrow_mut().push_node(data);
342 self.qual_names.borrow_mut().insert(id, name.clone());
343
344 if flags.template {
345 let template_root_offset = self.next_offset();
346 self.advance_offset(1);
347 let template_root = element(
348 "template-contents",
349 template_root_offset,
350 Attributes::default(),
351 );
352 let template_id = self.tree.borrow_mut().push_node(template_root);
353 self.template_contents.borrow_mut().insert(id, template_id);
354 }
355
356 id
357 }
358
359 fn create_comment(&self, _text: Tendril<html5ever::tendril::fmt::UTF8>) -> Self::Handle {
362 let offset = self.next_offset();
363 self.advance_offset(1);
364 let data = whitespace("", offset);
365 self.tree.borrow_mut().push_node(data)
366 }
367
368 fn create_pi(
372 &self,
373 _target: Tendril<html5ever::tendril::fmt::UTF8>,
374 _data: Tendril<html5ever::tendril::fmt::UTF8>,
375 ) -> Self::Handle {
376 let offset = self.next_offset();
377 self.advance_offset(1);
378 let data = whitespace("", offset);
379 self.tree.borrow_mut().push_node(data)
380 }
381
382 fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
388 match child {
389 NodeOrText::AppendNode(node) => {
390 self.tree.borrow_mut().attach_child(*parent, node);
391 }
392 NodeOrText::AppendText(t) => {
393 let text_str = t.as_ref();
394 let last_child_id = self
395 .tree
396 .borrow()
397 .get(*parent)
398 .last_child()
399 .filter(|n| n.tag_name().is_none() && !n.text().is_empty())
400 .map(|n| n.id);
401
402 if let Some(last_id) = last_child_id {
403 self.advance_offset(text_str.len());
404 self.tree.borrow_mut().append_text_to(last_id, text_str);
405 } else {
406 let offset = self.next_offset();
407 self.advance_offset(text_str.len());
408 let data = if Self::is_whitespace_only(text_str) {
409 whitespace(text_str, offset)
410 } else {
411 text(text_str, offset)
412 };
413 let node_id = self.tree.borrow_mut().push_node(data);
414 self.tree.borrow_mut().attach_child(*parent, node_id);
415 }
416 }
417 }
418 }
419
420 fn append_based_on_parent_node(
426 &self,
427 element: &Self::Handle,
428 prev_element: &Self::Handle,
429 child: NodeOrText<Self::Handle>,
430 ) {
431 let has_parent = self.tree.borrow().get(*element).parent().is_some();
432 if has_parent {
433 self.append_before_sibling(element, child);
434 } else {
435 self.append(prev_element, child);
436 }
437 }
438
439 fn append_before_sibling(&self, sibling: &Self::Handle, new_node: NodeOrText<Self::Handle>) {
441 match new_node {
442 NodeOrText::AppendNode(node) => {
443 self.tree.borrow_mut().insert_before(*sibling, node);
444 }
445 NodeOrText::AppendText(t) => {
446 let text_str = t.as_ref();
447 let offset = self.next_offset();
448 self.advance_offset(text_str.len());
449 let data = if Self::is_whitespace_only(text_str) {
450 whitespace(text_str, offset)
451 } else {
452 text(text_str, offset)
453 };
454 let node_id = self.tree.borrow_mut().push_node(data);
455 self.tree.borrow_mut().insert_before(*sibling, node_id);
456 }
457 }
458 }
459
460 fn append_doctype_to_document(
462 &self,
463 _name: Tendril<html5ever::tendril::fmt::UTF8>,
464 _public_id: Tendril<html5ever::tendril::fmt::UTF8>,
465 _system_id: Tendril<html5ever::tendril::fmt::UTF8>,
466 ) {
467 }
468
469 fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
470 *self
471 .template_contents
472 .borrow()
473 .get(target)
474 .expect("template contents not registered")
475 }
476
477 fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
478 x == y
479 }
480
481 fn set_quirks_mode(&self, _mode: QuirksMode) {}
483
484 fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<Attribute>) {
485 let mut tree = self.tree.borrow_mut();
486 for attr in attrs {
487 tree.add_attr_if_missing(*target, &Self::attr_name(&attr), &attr.value);
488 }
489 }
490
491 fn remove_from_parent(&self, target: &Self::Handle) {
492 self.tree.borrow_mut().detach(*target);
493 }
494
495 fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle) {
496 let children: Vec<NodeId> = self
497 .tree
498 .borrow()
499 .get(*node)
500 .children()
501 .map(|c| c.id)
502 .collect();
503 for child in children {
504 self.tree.borrow_mut().detach(child);
505 self.tree.borrow_mut().attach_child(*new_parent, child);
506 }
507 }
508}
509
510#[cfg_attr(feature = "tracing", tracing::instrument(skip(input), fields(len = input.len())))]
526pub fn parse_html5(input: &str) -> XmlTree {
527 use html5ever::{parse_document, ParseOpts};
528
529 let parser = parse_document(Html5Sink::new(), ParseOpts::default());
530 let input_tendril: Tendril<html5ever::tendril::fmt::UTF8> = input.into();
531 let mut tree = parser.one(input_tendril);
532 tree.wrap_lost_inlines();
533 tree
534}
535
#[cfg(test)]
mod tests {
    use super::*;

    /// A self-closing element is recorded at the offset of its `<`.
    #[test]
    fn test_simple_element() {
        let text = "<a/>";
        let xml = XmlParser::new(text).parse();
        let n = xml.root().first_child().unwrap();
        assert_eq!(n.offset(), 0);
        assert_eq!(n.tag_name(), Some("a"));
    }

    /// Both quote styles are accepted, and the other quote may appear inside
    /// a value.
    #[test]
    fn test_attributes() {
        let text = r#"<a b="c" d='e"'/>"#;
        let xml = XmlParser::new(text).parse();
        let n = xml.root().first_child().unwrap();
        assert_eq!(n.attribute("b"), Some("c"));
        assert_eq!(n.attribute("d"), Some("e\""));
    }

    /// A text node carries its byte offset and its content.
    #[test]
    fn test_text() {
        let text = "<a>bcd</a>";
        let xml = XmlParser::new(text).parse();
        let child = xml.root().first_child().unwrap().children().next();
        assert_eq!(child.map(|c| c.offset()), Some(3));
        assert_eq!(child.map(|c| c.text()), Some("bcd".to_string()));
    }

    /// Whitespace between two elements is kept as its own node.
    #[test]
    fn test_inbetween_space() {
        let text = "<a><b>x</b> <c>y</c></a>";
        let xml = XmlParser::new(text).parse();
        let child = xml.root().first_child().unwrap().children().nth(1);
        assert_eq!(child.map(|c| c.text()), Some(" ".to_string()));
    }

    /// Whitespace that is an element's only content is preserved.
    #[test]
    fn test_central_space() {
        let text = "<a><b> </b></a>";
        let xml = XmlParser::new(text).parse();
        assert_eq!(xml.root().text(), " ");
    }

    /// A void element needs no closing tag in HTML5.
    #[test]
    fn html5_void_element() {
        let text = "<br>";
        let xml = parse_html5(text);
        assert!(xml.root().find("br").is_some());
    }

    /// Character references are decoded by the HTML5 parser.
    #[test]
    fn html5_entity_decoding() {
        // The input must contain a real entity for this test to exercise
        // decoding; a bare `&` would pass through unchanged.
        let text = "<p>hello&amp;world</p>";
        let xml = parse_html5(text);
        let p = xml.root().find("p").unwrap();
        assert_eq!(p.text(), "hello&world");
    }

    /// An unclosed `<p>` is closed implicitly by the next `<p>`.
    #[test]
    fn html5_unclosed_p_tags() {
        let text = "<p>first<p>second";
        let xml = parse_html5(text);
        let count = xml
            .root()
            .descendants()
            .filter(|n| n.tag_name() == Some("p"))
            .count();
        assert_eq!(count, 2);
    }

    /// An `<ol>` nested directly inside another `<ol>` keeps its attributes.
    #[test]
    fn html5_nested_ol_in_ol() {
        let text =
            r#"<ol><li>top</li><ol style="list-style-type:lower-alpha"><li>sub</li></ol></ol>"#;
        let xml = parse_html5(text);
        let inner_ol = xml
            .root()
            .descendants()
            .find(|n| n.tag_name() == Some("ol") && n.attribute("style").is_some());
        assert!(
            inner_ol.is_some(),
            "inner <ol> with style should exist in the tree"
        );
        assert_eq!(
            inner_ol.unwrap().attribute("style"),
            Some("list-style-type:lower-alpha")
        );
    }

    /// Text following a comment must not be merged into the comment's
    /// placeholder node.
    #[test]
    fn html5_comment_does_not_coalesce_following_text() {
        let text = "<p>Hello<!-- comment -->World</p>";
        let xml = parse_html5(text);

        let p = xml.root().find("p").expect("p should exist");
        let children: Vec<_> = p.children().collect();

        assert_eq!(
            children.len(),
            3,
            "p should have 3 children: text, comment placeholder, text"
        );

        let text_nodes: Vec<_> = children
            .iter()
            .filter(|n| !n.text().is_empty())
            .map(|n| n.text())
            .collect();

        assert!(
            text_nodes.contains(&"Hello".to_string()),
            "text 'Hello' should exist as separate node"
        );
        assert!(
            text_nodes.contains(&"World".to_string()),
            "text 'World' should exist as separate node, not coalesced into comment node"
        );

        let comment_node = children
            .iter()
            .find(|n| n.text().is_empty() && n.tag_name().is_none());
        assert!(
            comment_node.is_some(),
            "empty whitespace node (comment placeholder) should exist"
        );
    }

    /// Text following a processing instruction must not be merged into the
    /// instruction's placeholder node.
    #[test]
    fn html5_pi_does_not_coalesce_following_text() {
        let text = "<p>Hello<?target data?>World</p>";
        let xml = parse_html5(text);

        let p = xml.root().find("p").expect("p should exist");
        let children: Vec<_> = p.children().collect();

        assert_eq!(
            children.len(),
            3,
            "p should have 3 children: text, pi placeholder, text"
        );

        let text_nodes: Vec<_> = children
            .iter()
            .filter(|n| !n.text().is_empty())
            .map(|n| n.text())
            .collect();

        assert!(
            text_nodes.contains(&"Hello".to_string()),
            "text 'Hello' should exist as separate node"
        );
        assert!(
            text_nodes.contains(&"World".to_string()),
            "text 'World' should exist as separate node, not coalesced into pi node"
        );
    }

    /// The offset ranges assigned to text nodes must not overlap.
    #[test]
    fn html5_text_node_offsets_do_not_overlap() {
        let text = "<p><em>Cadmus</em> is a document reader for <em>Kobo</em>'s e-readers.</p>";
        let xml = parse_html5(text);

        let mut text_nodes: Vec<(usize, usize)> = xml
            .root()
            .descendants()
            .filter(|n| n.tag_name().is_none())
            .map(|n| (n.offset(), n.text().len()))
            .filter(|(_, len)| *len > 0)
            .collect();

        text_nodes.sort_by_key(|(offset, _)| *offset);

        for window in text_nodes.windows(2) {
            let (offset_a, len_a) = window[0];
            let (offset_b, _) = window[1];
            assert!(
                offset_a + len_a <= offset_b,
                "text node at offset {} with len {} overlaps next node at offset {}",
                offset_a,
                len_a,
                offset_b
            );
        }
    }
}