Skip to main content

cadmus_core/document/html/
dom.rs

1use fxhash::{FxHashMap, FxHashSet};
2use std::num::NonZeroUsize;
3
4pub type Attributes = FxHashMap<String, String>;
5pub const WRAPPER_TAG_NAME: &str = "anonymous";
6
7#[derive(Debug, Clone)]
8pub enum NodeData {
9    Root,
10    Wrapper(usize),
11    Element(ElementData),
12    Text(TextData),
13    Whitespace(TextData),
14}
15
16#[derive(Debug, Clone)]
17pub struct ElementData {
18    pub offset: usize,
19    pub name: String,
20    pub qualified_name: Option<String>,
21    pub attributes: Attributes,
22}
23
24impl ElementData {
25    fn is_block(&self) -> bool {
26        matches!(
27            self.name.as_str(),
28            "address"
29                | "article"
30                | "aside"
31                | "blockquote"
32                | "body"
33                | "head"
34                | "details"
35                | "dialog"
36                | "dd"
37                | "div"
38                | "dl"
39                | "dt"
40                | "fieldset"
41                | "figcaption"
42                | "figure"
43                | "footer"
44                | "form"
45                | "h1"
46                | "h2"
47                | "h3"
48                | "h4"
49                | "h5"
50                | "h6"
51                | "header"
52                | "hgroup"
53                | "hr"
54                | "html"
55                | "li"
56                | "main"
57                | "nav"
58                | "ol"
59                | "p"
60                | "pre"
61                | "section"
62                | "table"
63                | "thead"
64                | "colgroup"
65                | "tbody"
66                | "tfoot"
67                | "tr"
68                | "caption"
69                | "td"
70                | "th"
71                | "ul"
72        )
73    }
74}
75
76impl NodeData {
77    fn text(&self) -> Option<&str> {
78        match *self {
79            NodeData::Text(TextData { ref text, .. })
80            | NodeData::Whitespace(TextData { ref text, .. }) => Some(text),
81            _ => None,
82        }
83    }
84
85    fn offset(&self) -> usize {
86        match *self {
87            NodeData::Text(TextData { offset, .. })
88            | NodeData::Whitespace(TextData { offset, .. })
89            | NodeData::Element(ElementData { offset, .. }) => offset,
90            NodeData::Wrapper(offset) => offset,
91            NodeData::Root => 0,
92        }
93    }
94}
95
96#[derive(Debug, Clone)]
97pub struct TextData {
98    pub offset: usize,
99    pub text: String,
100}
101
102pub fn element(name: &str, offset: usize, attributes: Attributes) -> NodeData {
103    let colon = name.find(':');
104    NodeData::Element(ElementData {
105        offset,
106        name: name[colon.map(|index| index + 1).unwrap_or(0)..].to_string(),
107        qualified_name: colon.map(|_| name.to_string()),
108        attributes,
109    })
110}
111
112pub fn text(text: &str, offset: usize) -> NodeData {
113    NodeData::Text(TextData {
114        offset,
115        text: text.to_string(),
116    })
117}
118
119pub fn whitespace(text: &str, offset: usize) -> NodeData {
120    NodeData::Whitespace(TextData {
121        offset,
122        text: text.to_string(),
123    })
124}
125
126#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
127pub struct NodeId(NonZeroUsize);
128
129impl NodeId {
130    pub fn from_index(n: usize) -> Self {
131        NodeId(unsafe { NonZeroUsize::new_unchecked(n + 1) })
132    }
133
134    pub fn to_index(self) -> usize {
135        self.0.get() - 1
136    }
137}
138
139#[derive(Debug, Clone)]
140pub struct XmlTree {
141    nodes: Vec<Node>,
142}
143
144#[derive(Debug, Clone)]
145pub struct Node {
146    data: NodeData,
147    parent: Option<NodeId>,
148    previous_sibling: Option<NodeId>,
149    next_sibling: Option<NodeId>,
150    first_child: Option<NodeId>,
151    last_child: Option<NodeId>,
152}
153
154impl Default for Node {
155    fn default() -> Self {
156        Node {
157            data: NodeData::Root,
158            parent: None,
159            previous_sibling: None,
160            next_sibling: None,
161            first_child: None,
162            last_child: None,
163        }
164    }
165}
166
167#[derive(Debug, Copy, Clone)]
168pub struct NodeRef<'a> {
169    pub id: NodeId,
170    pub node: &'a Node,
171    pub tree: &'a XmlTree,
172}
173
174#[derive(Debug)]
175pub struct NodeMut<'a> {
176    pub id: NodeId,
177    pub tree: &'a mut XmlTree,
178}
179
180impl XmlTree {
181    pub fn new() -> Self {
182        XmlTree {
183            nodes: vec![Node::default()],
184        }
185    }
186
187    fn node(&self, id: NodeId) -> &Node {
188        unsafe { self.nodes.get_unchecked(id.to_index()) }
189    }
190
191    fn node_mut(&mut self, id: NodeId) -> &mut Node {
192        unsafe { self.nodes.get_unchecked_mut(id.to_index()) }
193    }
194
195    pub fn get(&self, id: NodeId) -> NodeRef<'_> {
196        NodeRef {
197            id,
198            node: self.node(id),
199            tree: self,
200        }
201    }
202
203    pub fn get_mut(&mut self, id: NodeId) -> NodeMut<'_> {
204        NodeMut { id, tree: self }
205    }
206
207    pub fn root(&self) -> NodeRef<'_> {
208        self.get(NodeId::from_index(0))
209    }
210
211    pub fn root_mut(&mut self) -> NodeMut<'_> {
212        self.get_mut(NodeId::from_index(0))
213    }
214
215    pub fn push_node(&mut self, data: NodeData) -> NodeId {
216        let id = NodeId::from_index(self.nodes.len());
217        let node = Node {
218            data,
219            parent: None,
220            previous_sibling: None,
221            next_sibling: None,
222            first_child: None,
223            last_child: None,
224        };
225        self.nodes.push(node);
226        id
227    }
228
229    pub fn attach_child(&mut self, parent_id: NodeId, child_id: NodeId) {
230        let last_child = self.node(parent_id).last_child;
231
232        let child = self.node_mut(child_id);
233        child.parent = Some(parent_id);
234        child.previous_sibling = last_child;
235        child.next_sibling = None;
236
237        if let Some(last) = last_child {
238            self.node_mut(last).next_sibling = Some(child_id);
239        }
240
241        self.node_mut(parent_id).last_child = Some(child_id);
242
243        if self.node(parent_id).first_child.is_none() {
244            self.node_mut(parent_id).first_child = Some(child_id);
245        }
246    }
247
248    pub fn insert_before(&mut self, sibling_id: NodeId, new_id: NodeId) {
249        let parent_id = self.node(sibling_id).parent;
250        let prev_sibling = self.node(sibling_id).previous_sibling;
251
252        self.node_mut(new_id).parent = parent_id;
253        self.node_mut(new_id).previous_sibling = prev_sibling;
254        self.node_mut(new_id).next_sibling = Some(sibling_id);
255
256        self.node_mut(sibling_id).previous_sibling = Some(new_id);
257
258        if let Some(prev) = prev_sibling {
259            self.node_mut(prev).next_sibling = Some(new_id);
260        } else if let Some(parent) = parent_id {
261            self.node_mut(parent).first_child = Some(new_id);
262        }
263    }
264
265    pub fn detach(&mut self, id: NodeId) {
266        let node = self.node(id);
267        let parent_id = node.parent;
268        let prev_sibling = node.previous_sibling;
269        let next_sibling = node.next_sibling;
270
271        if let Some(prev) = prev_sibling {
272            self.node_mut(prev).next_sibling = next_sibling;
273        } else if let Some(parent) = parent_id {
274            self.node_mut(parent).first_child = next_sibling;
275        }
276
277        if let Some(next) = next_sibling {
278            self.node_mut(next).previous_sibling = prev_sibling;
279        } else if let Some(parent) = parent_id {
280            self.node_mut(parent).last_child = prev_sibling;
281        }
282
283        self.node_mut(id).parent = None;
284        self.node_mut(id).previous_sibling = None;
285        self.node_mut(id).next_sibling = None;
286    }
287
288    pub fn append_text_to(&mut self, id: NodeId, extra: &str) {
289        match &mut self.node_mut(id).data {
290            NodeData::Text(TextData { ref mut text, .. })
291            | NodeData::Whitespace(TextData { ref mut text, .. }) => {
292                text.push_str(extra);
293            }
294            _ => {}
295        }
296    }
297
298    pub fn add_attr_if_missing(&mut self, id: NodeId, name: &str, value: &str) {
299        if let NodeData::Element(ElementData {
300            ref mut attributes, ..
301        }) = self.node_mut(id).data
302        {
303            if !attributes.contains_key(name) {
304                attributes.insert(name.to_string(), value.to_string());
305            }
306        }
307    }
308
309    pub fn wrap_lost_inlines(&mut self) {
310        let mut ids = Vec::new();
311        let mut known_ids = FxHashSet::default();
312
313        for n in self.root().descendants().filter(|n| n.is_inline()) {
314            if known_ids.contains(&n.id) {
315                continue;
316            }
317
318            let mut first_id = None;
319            let mut last_id = None;
320
321            for s in n.previous_siblings() {
322                if s.is_block() {
323                    first_id = Some(s.next_sibling().unwrap().id);
324                    break;
325                } else {
326                    known_ids.insert(s.id);
327                }
328            }
329
330            for s in n.next_siblings() {
331                if s.is_block() {
332                    last_id = Some(s.previous_sibling().unwrap().id);
333                    break;
334                } else {
335                    known_ids.insert(s.id);
336                }
337            }
338
339            if first_id.is_some() || last_id.is_some() {
340                let parent = n.parent().unwrap();
341                ids.push([
342                    parent.id,
343                    first_id.unwrap_or_else(|| parent.node.first_child.unwrap()),
344                    last_id.unwrap_or_else(|| parent.node.last_child.unwrap()),
345                ]);
346            }
347        }
348
349        for [parent_id, first_id, last_id] in ids {
350            let offset = self.node(first_id).data.offset();
351            let mut node = self.get_mut(parent_id);
352            node.wrap_range(first_id, last_id, NodeData::Wrapper(offset));
353        }
354    }
355}
356
357impl<'a> NodeRef<'a> {
358    pub fn parent(&self) -> Option<Self> {
359        self.node.parent.map(|id| self.tree.get(id))
360    }
361
362    pub fn parent_element(&self) -> Option<Self> {
363        self.ancestors().find(|n| n.is_element() && !n.is_wrapper())
364    }
365
366    pub fn previous_sibling(&self) -> Option<Self> {
367        self.node.previous_sibling.map(|id| self.tree.get(id))
368    }
369
370    pub fn previous_sibling_element(&self) -> Option<NodeRef<'a>> {
371        self.previous_sibling_elements().next()
372    }
373
374    pub fn next_sibling_element(&self) -> Option<NodeRef<'a>> {
375        self.next_sibling_elements().next()
376    }
377
378    pub fn next_sibling(&self) -> Option<Self> {
379        self.node.next_sibling.map(|id| self.tree.get(id))
380    }
381
382    pub fn first_child(&self) -> Option<Self> {
383        self.node.first_child.map(|id| self.tree.get(id))
384    }
385
386    pub fn last_child(&self) -> Option<Self> {
387        self.node.last_child.map(|id| self.tree.get(id))
388    }
389
390    pub fn ancestors(&self) -> Ancestors<'a> {
391        Ancestors {
392            next: self.parent(),
393        }
394    }
395
396    pub fn ancestor_elements(&self) -> impl Iterator<Item = NodeRef<'a>> {
397        self.ancestors()
398            .filter(|n| n.is_element() && !n.is_wrapper())
399    }
400
401    pub fn previous_siblings(&self) -> PreviousSiblings<'a> {
402        PreviousSiblings {
403            next: self.previous_sibling(),
404        }
405    }
406
407    pub fn next_siblings(&self) -> NextSiblings<'a> {
408        NextSiblings {
409            next: self.next_sibling(),
410        }
411    }
412
413    pub fn previous_sibling_elements(&self) -> impl Iterator<Item = NodeRef<'a>> {
414        self.previous_siblings().filter(|n| n.is_element())
415    }
416
417    pub fn next_sibling_elements(&self) -> impl Iterator<Item = NodeRef<'a>> {
418        self.next_siblings().filter(|n| n.is_element())
419    }
420
421    pub fn children(&self) -> Children<'a> {
422        Children {
423            next: self.first_child(),
424        }
425    }
426
427    pub fn descendants(&self) -> Descendants<'a> {
428        Descendants {
429            root_id: self.id,
430            next: self.first_child(),
431        }
432    }
433
434    pub fn has_children(&self) -> bool {
435        self.node.first_child.is_some()
436    }
437
438    pub fn is_element(&self) -> bool {
439        matches!(
440            self.node.data,
441            NodeData::Element { .. } | NodeData::Wrapper(..) | NodeData::Root
442        )
443    }
444
445    pub fn is_inline(&self) -> bool {
446        match &self.node.data {
447            NodeData::Element(e) => !e.is_block(),
448            NodeData::Text(..) => true,
449            _ => false,
450        }
451    }
452
453    pub fn is_block(&self) -> bool {
454        match &self.node.data {
455            NodeData::Element(e) => e.is_block(),
456            NodeData::Wrapper(..) | NodeData::Root => true,
457            _ => false,
458        }
459    }
460
461    pub fn is_wrapper(&self) -> bool {
462        matches!(self.node.data, NodeData::Wrapper(..))
463    }
464
465    pub fn data(&self) -> &'a NodeData {
466        &self.node.data
467    }
468
469    pub fn offset(&self) -> usize {
470        self.node.data.offset()
471    }
472
473    pub fn text(&self) -> String {
474        self.node.data.text().map(String::from).unwrap_or_else(|| {
475            self.descendants()
476                .filter_map(|n| n.node.data.text())
477                .fold(String::new(), |mut a, b| {
478                    a.push_str(b);
479                    a
480                })
481        })
482    }
483
484    pub fn tag_name(&self) -> Option<&'a str> {
485        match self.node.data {
486            NodeData::Element(ElementData { ref name, .. }) => Some(name),
487            NodeData::Wrapper(..) => Some(WRAPPER_TAG_NAME),
488            _ => None,
489        }
490    }
491
492    pub fn tag_qualified_name(&self) -> Option<&'a str> {
493        match self.node.data {
494            NodeData::Element(ElementData {
495                ref qualified_name, ..
496            }) => qualified_name.as_deref(),
497            _ => None,
498        }
499    }
500
501    pub fn attributes(&self) -> Option<&'a Attributes> {
502        match self.node.data {
503            NodeData::Element(ElementData { ref attributes, .. }) => Some(attributes),
504            _ => None,
505        }
506    }
507
508    pub fn attribute(&self, name: &str) -> Option<&'a str> {
509        self.attributes()
510            .and_then(|a| a.get(name).map(String::as_str))
511    }
512
513    pub fn classes(&self) -> impl Iterator<Item = &'a str> {
514        self.attribute("class").unwrap_or("").split_whitespace()
515    }
516
517    pub fn id(&self) -> Option<&str> {
518        self.attribute("id")
519    }
520
521    pub fn find(&self, tag_name: &str) -> Option<Self> {
522        self.descendants().find(|n| n.tag_name() == Some(tag_name))
523    }
524
525    pub fn find_by_id(&self, id: &str) -> Option<Self> {
526        self.descendants().find(|n| n.id() == Some(id))
527    }
528}
529
530impl<'a> NodeMut<'a> {
531    fn node(&mut self) -> &mut Node {
532        self.tree.node_mut(self.id)
533    }
534
535    pub fn append(&mut self, data: NodeData) -> NodeId {
536        let id = NodeId::from_index(self.tree.nodes.len());
537
538        let node = Node {
539            data,
540            parent: Some(self.id),
541            previous_sibling: self.node().last_child,
542            next_sibling: None,
543            first_child: None,
544            last_child: None,
545        };
546
547        self.tree.nodes.push(node);
548
549        if let Some(last_child) = self.node().last_child {
550            self.tree.node_mut(last_child).next_sibling = Some(id);
551        }
552
553        self.node().last_child = Some(id);
554
555        if self.node().first_child.is_none() {
556            self.node().first_child = Some(id);
557        }
558
559        id
560    }
561
562    pub fn wrap_range(&mut self, first_id: NodeId, last_id: NodeId, data: NodeData) {
563        let before = self.tree.node(first_id).previous_sibling;
564        let after = self.tree.node(last_id).next_sibling;
565        let id = NodeId::from_index(self.tree.nodes.len());
566
567        let node = Node {
568            data,
569            parent: Some(self.id),
570            previous_sibling: before,
571            next_sibling: after,
572            first_child: Some(first_id),
573            last_child: Some(last_id),
574        };
575
576        self.tree.nodes.push(node);
577
578        if let Some(before_id) = before {
579            self.tree.node_mut(before_id).next_sibling = Some(id);
580        }
581
582        if let Some(after_id) = after {
583            self.tree.node_mut(after_id).previous_sibling = Some(id);
584        }
585
586        if let Some(first_child_id) = self.node().first_child {
587            if first_child_id == first_id {
588                self.node().first_child = Some(id);
589            }
590        }
591
592        if let Some(last_child_id) = self.node().last_child {
593            if last_child_id == last_id {
594                self.node().last_child = Some(id);
595            }
596        }
597
598        self.tree.node_mut(first_id).previous_sibling = None;
599        self.tree.node_mut(last_id).next_sibling = None;
600        self.tree.node_mut(first_id).parent = Some(id);
601
602        let mut node_id = first_id;
603        while let Some(next_id) = self.tree.node(node_id).next_sibling {
604            self.tree.node_mut(next_id).parent = Some(id);
605            node_id = next_id;
606        }
607    }
608}
609
610pub struct Ancestors<'a> {
611    next: Option<NodeRef<'a>>,
612}
613
614pub struct NextSiblings<'a> {
615    next: Option<NodeRef<'a>>,
616}
617
618pub struct PreviousSiblings<'a> {
619    next: Option<NodeRef<'a>>,
620}
621
622pub struct Children<'a> {
623    next: Option<NodeRef<'a>>,
624}
625
626pub struct Descendants<'a> {
627    root_id: NodeId,
628    next: Option<NodeRef<'a>>,
629}
630
631impl<'a> Iterator for Ancestors<'a> {
632    type Item = NodeRef<'a>;
633
634    fn next(&mut self) -> Option<Self::Item> {
635        let node = self.next.take();
636        self.next = node.as_ref().and_then(|node| node.parent());
637        node
638    }
639}
640
641impl<'a> Iterator for PreviousSiblings<'a> {
642    type Item = NodeRef<'a>;
643
644    fn next(&mut self) -> Option<Self::Item> {
645        let node = self.next.take();
646        self.next = node.as_ref().and_then(|node| node.previous_sibling());
647        node
648    }
649}
650
651impl<'a> Iterator for NextSiblings<'a> {
652    type Item = NodeRef<'a>;
653
654    fn next(&mut self) -> Option<Self::Item> {
655        let node = self.next.take();
656        self.next = node.as_ref().and_then(|node| node.next_sibling());
657        node
658    }
659}
660
661impl<'a> Iterator for Children<'a> {
662    type Item = NodeRef<'a>;
663
664    fn next(&mut self) -> Option<Self::Item> {
665        let node = self.next.take();
666        self.next = node.as_ref().and_then(|node| node.next_sibling());
667        node
668    }
669}
670
671impl<'a> Iterator for Descendants<'a> {
672    type Item = NodeRef<'a>;
673
674    fn next(&mut self) -> Option<Self::Item> {
675        let node = self.next.take();
676        if let Some(node) = node {
677            self.next = node
678                .first_child()
679                .or_else(|| node.next_sibling())
680                .or_else(|| {
681                    node.ancestors()
682                        .take_while(|n| n.id != self.root_id)
683                        .find(|n| n.node.next_sibling.is_some())
684                        .and_then(|n| n.next_sibling())
685                });
686        }
687        node
688    }
689}