cadmus_core/document/html/
dom.rs

1use fxhash::{FxHashMap, FxHashSet};
2use std::num::NonZeroUsize;
3
4pub type Attributes = FxHashMap<String, String>;
5pub const WRAPPER_TAG_NAME: &str = "anonymous";
6
7#[derive(Debug, Clone)]
8pub enum NodeData {
9    Root,
10    Wrapper(usize),
11    Element(ElementData),
12    Text(TextData),
13    Whitespace(TextData),
14}
15
16#[derive(Debug, Clone)]
17pub struct ElementData {
18    pub offset: usize,
19    pub name: String,
20    pub qualified_name: Option<String>,
21    pub attributes: Attributes,
22}
23
24impl ElementData {
25    fn is_block(&self) -> bool {
26        matches!(
27            self.name.as_str(),
28            "address"
29                | "article"
30                | "aside"
31                | "blockquote"
32                | "body"
33                | "head"
34                | "details"
35                | "dialog"
36                | "dd"
37                | "div"
38                | "dl"
39                | "dt"
40                | "fieldset"
41                | "figcaption"
42                | "figure"
43                | "footer"
44                | "form"
45                | "h1"
46                | "h2"
47                | "h3"
48                | "h4"
49                | "h5"
50                | "h6"
51                | "header"
52                | "hgroup"
53                | "hr"
54                | "html"
55                | "li"
56                | "main"
57                | "nav"
58                | "ol"
59                | "p"
60                | "pre"
61                | "section"
62                | "table"
63                | "thead"
64                | "colgroup"
65                | "tbody"
66                | "tfoot"
67                | "tr"
68                | "caption"
69                | "td"
70                | "th"
71                | "ul"
72        )
73    }
74}
75
76impl NodeData {
77    fn text(&self) -> Option<&str> {
78        match *self {
79            NodeData::Text(TextData { ref text, .. })
80            | NodeData::Whitespace(TextData { ref text, .. }) => Some(text),
81            _ => None,
82        }
83    }
84
85    fn offset(&self) -> usize {
86        match *self {
87            NodeData::Text(TextData { offset, .. })
88            | NodeData::Whitespace(TextData { offset, .. })
89            | NodeData::Element(ElementData { offset, .. }) => offset,
90            NodeData::Wrapper(offset) => offset,
91            NodeData::Root => 0,
92        }
93    }
94}
95
96#[derive(Debug, Clone)]
97pub struct TextData {
98    pub offset: usize,
99    pub text: String,
100}
101
102pub fn element(name: &str, offset: usize, attributes: Attributes) -> NodeData {
103    let colon = name.find(':');
104    NodeData::Element(ElementData {
105        offset,
106        name: name[colon.map(|index| index + 1).unwrap_or(0)..].to_string(),
107        qualified_name: colon.map(|_| name.to_string()),
108        attributes,
109    })
110}
111
112pub fn text(text: &str, offset: usize) -> NodeData {
113    NodeData::Text(TextData {
114        offset,
115        text: text.to_string(),
116    })
117}
118
119pub fn whitespace(text: &str, offset: usize) -> NodeData {
120    NodeData::Whitespace(TextData {
121        offset,
122        text: text.to_string(),
123    })
124}
125
126#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
127pub struct NodeId(NonZeroUsize);
128
129impl NodeId {
130    pub fn from_index(n: usize) -> Self {
131        NodeId(unsafe { NonZeroUsize::new_unchecked(n + 1) })
132    }
133
134    pub fn to_index(self) -> usize {
135        self.0.get() - 1
136    }
137}
138
139#[derive(Debug, Clone)]
140pub struct XmlTree {
141    nodes: Vec<Node>,
142}
143
144#[derive(Debug, Clone)]
145pub struct Node {
146    data: NodeData,
147    parent: Option<NodeId>,
148    previous_sibling: Option<NodeId>,
149    next_sibling: Option<NodeId>,
150    first_child: Option<NodeId>,
151    last_child: Option<NodeId>,
152}
153
154impl Default for Node {
155    fn default() -> Self {
156        Node {
157            data: NodeData::Root,
158            parent: None,
159            previous_sibling: None,
160            next_sibling: None,
161            first_child: None,
162            last_child: None,
163        }
164    }
165}
166
167#[derive(Debug, Copy, Clone)]
168pub struct NodeRef<'a> {
169    pub id: NodeId,
170    pub node: &'a Node,
171    pub tree: &'a XmlTree,
172}
173
174#[derive(Debug)]
175pub struct NodeMut<'a> {
176    pub id: NodeId,
177    pub tree: &'a mut XmlTree,
178}
179
180impl XmlTree {
181    pub fn new() -> Self {
182        XmlTree {
183            nodes: vec![Node::default()],
184        }
185    }
186
187    fn node(&self, id: NodeId) -> &Node {
188        unsafe { self.nodes.get_unchecked(id.to_index()) }
189    }
190
191    fn node_mut(&mut self, id: NodeId) -> &mut Node {
192        unsafe { self.nodes.get_unchecked_mut(id.to_index()) }
193    }
194
195    pub fn get(&self, id: NodeId) -> NodeRef<'_> {
196        NodeRef {
197            id,
198            node: self.node(id),
199            tree: self,
200        }
201    }
202
203    pub fn get_mut(&mut self, id: NodeId) -> NodeMut<'_> {
204        NodeMut { id, tree: self }
205    }
206
207    pub fn root(&self) -> NodeRef<'_> {
208        self.get(NodeId::from_index(0))
209    }
210
211    pub fn root_mut(&mut self) -> NodeMut<'_> {
212        self.get_mut(NodeId::from_index(0))
213    }
214
215    pub fn wrap_lost_inlines(&mut self) {
216        let mut ids = Vec::new();
217        let mut known_ids = FxHashSet::default();
218
219        for n in self.root().descendants().filter(|n| n.is_inline()) {
220            if known_ids.contains(&n.id) {
221                continue;
222            }
223
224            let mut first_id = None;
225            let mut last_id = None;
226
227            for s in n.previous_siblings() {
228                if s.is_block() {
229                    first_id = Some(s.next_sibling().unwrap().id);
230                    break;
231                } else {
232                    known_ids.insert(s.id);
233                }
234            }
235
236            for s in n.next_siblings() {
237                if s.is_block() {
238                    last_id = Some(s.previous_sibling().unwrap().id);
239                    break;
240                } else {
241                    known_ids.insert(s.id);
242                }
243            }
244
245            if first_id.is_some() || last_id.is_some() {
246                let parent = n.parent().unwrap();
247                ids.push([
248                    parent.id,
249                    first_id.unwrap_or_else(|| parent.node.first_child.unwrap()),
250                    last_id.unwrap_or_else(|| parent.node.last_child.unwrap()),
251                ]);
252            }
253        }
254
255        for [parent_id, first_id, last_id] in ids {
256            let offset = self.node(first_id).data.offset();
257            let mut node = self.get_mut(parent_id);
258            node.wrap_range(first_id, last_id, NodeData::Wrapper(offset));
259        }
260    }
261}
262
263impl<'a> NodeRef<'a> {
264    pub fn parent(&self) -> Option<Self> {
265        self.node.parent.map(|id| self.tree.get(id))
266    }
267
268    pub fn parent_element(&self) -> Option<Self> {
269        self.ancestors().find(|n| n.is_element() && !n.is_wrapper())
270    }
271
272    pub fn previous_sibling(&self) -> Option<Self> {
273        self.node.previous_sibling.map(|id| self.tree.get(id))
274    }
275
276    pub fn previous_sibling_element(&self) -> Option<NodeRef<'a>> {
277        self.previous_sibling_elements().next()
278    }
279
280    pub fn next_sibling_element(&self) -> Option<NodeRef<'a>> {
281        self.next_sibling_elements().next()
282    }
283
284    pub fn next_sibling(&self) -> Option<Self> {
285        self.node.next_sibling.map(|id| self.tree.get(id))
286    }
287
288    pub fn first_child(&self) -> Option<Self> {
289        self.node.first_child.map(|id| self.tree.get(id))
290    }
291
292    pub fn last_child(&self) -> Option<Self> {
293        self.node.last_child.map(|id| self.tree.get(id))
294    }
295
296    pub fn ancestors(&self) -> Ancestors<'a> {
297        Ancestors {
298            next: self.parent(),
299        }
300    }
301
302    pub fn ancestor_elements(&self) -> impl Iterator<Item = NodeRef<'a>> {
303        self.ancestors()
304            .filter(|n| n.is_element() && !n.is_wrapper())
305    }
306
307    pub fn previous_siblings(&self) -> PreviousSiblings<'a> {
308        PreviousSiblings {
309            next: self.previous_sibling(),
310        }
311    }
312
313    pub fn next_siblings(&self) -> NextSiblings<'a> {
314        NextSiblings {
315            next: self.next_sibling(),
316        }
317    }
318
319    pub fn previous_sibling_elements(&self) -> impl Iterator<Item = NodeRef<'a>> {
320        self.previous_siblings().filter(|n| n.is_element())
321    }
322
323    pub fn next_sibling_elements(&self) -> impl Iterator<Item = NodeRef<'a>> {
324        self.next_siblings().filter(|n| n.is_element())
325    }
326
327    pub fn children(&self) -> Children<'a> {
328        Children {
329            next: self.first_child(),
330        }
331    }
332
333    pub fn descendants(&self) -> Descendants<'a> {
334        Descendants {
335            root_id: self.id,
336            next: self.first_child(),
337        }
338    }
339
340    pub fn has_children(&self) -> bool {
341        self.node.first_child.is_some()
342    }
343
344    pub fn is_element(&self) -> bool {
345        matches!(
346            self.node.data,
347            NodeData::Element { .. } | NodeData::Wrapper(..) | NodeData::Root
348        )
349    }
350
351    pub fn is_inline(&self) -> bool {
352        match &self.node.data {
353            NodeData::Element(e) => !e.is_block(),
354            NodeData::Text(..) => true,
355            _ => false,
356        }
357    }
358
359    pub fn is_block(&self) -> bool {
360        match &self.node.data {
361            NodeData::Element(e) => e.is_block(),
362            NodeData::Wrapper(..) | NodeData::Root => true,
363            _ => false,
364        }
365    }
366
367    pub fn is_wrapper(&self) -> bool {
368        matches!(self.node.data, NodeData::Wrapper(..))
369    }
370
371    pub fn data(&self) -> &'a NodeData {
372        &self.node.data
373    }
374
375    pub fn offset(&self) -> usize {
376        self.node.data.offset()
377    }
378
379    pub fn text(&self) -> String {
380        self.node.data.text().map(String::from).unwrap_or_else(|| {
381            self.descendants()
382                .filter_map(|n| n.node.data.text())
383                .fold(String::new(), |mut a, b| {
384                    a.push_str(b);
385                    a
386                })
387        })
388    }
389
390    pub fn tag_name(&self) -> Option<&'a str> {
391        match self.node.data {
392            NodeData::Element(ElementData { ref name, .. }) => Some(name),
393            NodeData::Wrapper(..) => Some(WRAPPER_TAG_NAME),
394            _ => None,
395        }
396    }
397
398    pub fn tag_qualified_name(&self) -> Option<&'a str> {
399        match self.node.data {
400            NodeData::Element(ElementData {
401                ref qualified_name, ..
402            }) => qualified_name.as_deref(),
403            _ => None,
404        }
405    }
406
407    pub fn attributes(&self) -> Option<&'a Attributes> {
408        match self.node.data {
409            NodeData::Element(ElementData { ref attributes, .. }) => Some(attributes),
410            _ => None,
411        }
412    }
413
414    pub fn attribute(&self, name: &str) -> Option<&'a str> {
415        self.attributes()
416            .and_then(|a| a.get(name).map(String::as_str))
417    }
418
419    pub fn classes(&self) -> impl Iterator<Item = &'a str> {
420        self.attribute("class").unwrap_or("").split_whitespace()
421    }
422
423    pub fn id(&self) -> Option<&str> {
424        self.attribute("id")
425    }
426
427    pub fn find(&self, tag_name: &str) -> Option<Self> {
428        self.descendants().find(|n| n.tag_name() == Some(tag_name))
429    }
430
431    pub fn find_by_id(&self, id: &str) -> Option<Self> {
432        self.descendants().find(|n| n.id() == Some(id))
433    }
434}
435
436impl<'a> NodeMut<'a> {
437    fn node(&mut self) -> &mut Node {
438        self.tree.node_mut(self.id)
439    }
440
441    pub fn append(&mut self, data: NodeData) -> NodeId {
442        let id = NodeId::from_index(self.tree.nodes.len());
443
444        let node = Node {
445            data,
446            parent: Some(self.id),
447            previous_sibling: self.node().last_child,
448            next_sibling: None,
449            first_child: None,
450            last_child: None,
451        };
452
453        self.tree.nodes.push(node);
454
455        if let Some(last_child) = self.node().last_child {
456            self.tree.node_mut(last_child).next_sibling = Some(id);
457        }
458
459        self.node().last_child = Some(id);
460
461        if self.node().first_child.is_none() {
462            self.node().first_child = Some(id);
463        }
464
465        id
466    }
467
468    pub fn wrap_range(&mut self, first_id: NodeId, last_id: NodeId, data: NodeData) {
469        let before = self.tree.node(first_id).previous_sibling;
470        let after = self.tree.node(last_id).next_sibling;
471        let id = NodeId::from_index(self.tree.nodes.len());
472
473        let node = Node {
474            data,
475            parent: Some(self.id),
476            previous_sibling: before,
477            next_sibling: after,
478            first_child: Some(first_id),
479            last_child: Some(last_id),
480        };
481
482        self.tree.nodes.push(node);
483
484        if let Some(before_id) = before {
485            self.tree.node_mut(before_id).next_sibling = Some(id);
486        }
487
488        if let Some(after_id) = after {
489            self.tree.node_mut(after_id).previous_sibling = Some(id);
490        }
491
492        if let Some(first_child_id) = self.node().first_child {
493            if first_child_id == first_id {
494                self.node().first_child = Some(id);
495            }
496        }
497
498        if let Some(last_child_id) = self.node().last_child {
499            if last_child_id == last_id {
500                self.node().last_child = Some(id);
501            }
502        }
503
504        self.tree.node_mut(first_id).previous_sibling = None;
505        self.tree.node_mut(last_id).next_sibling = None;
506        self.tree.node_mut(first_id).parent = Some(id);
507
508        let mut node_id = first_id;
509        while let Some(next_id) = self.tree.node(node_id).next_sibling {
510            self.tree.node_mut(next_id).parent = Some(id);
511            node_id = next_id;
512        }
513    }
514}
515
516pub struct Ancestors<'a> {
517    next: Option<NodeRef<'a>>,
518}
519
520pub struct NextSiblings<'a> {
521    next: Option<NodeRef<'a>>,
522}
523
524pub struct PreviousSiblings<'a> {
525    next: Option<NodeRef<'a>>,
526}
527
528pub struct Children<'a> {
529    next: Option<NodeRef<'a>>,
530}
531
532pub struct Descendants<'a> {
533    root_id: NodeId,
534    next: Option<NodeRef<'a>>,
535}
536
537impl<'a> Iterator for Ancestors<'a> {
538    type Item = NodeRef<'a>;
539
540    fn next(&mut self) -> Option<Self::Item> {
541        let node = self.next.take();
542        self.next = node.as_ref().and_then(|node| node.parent());
543        node
544    }
545}
546
547impl<'a> Iterator for PreviousSiblings<'a> {
548    type Item = NodeRef<'a>;
549
550    fn next(&mut self) -> Option<Self::Item> {
551        let node = self.next.take();
552        self.next = node.as_ref().and_then(|node| node.previous_sibling());
553        node
554    }
555}
556
557impl<'a> Iterator for NextSiblings<'a> {
558    type Item = NodeRef<'a>;
559
560    fn next(&mut self) -> Option<Self::Item> {
561        let node = self.next.take();
562        self.next = node.as_ref().and_then(|node| node.next_sibling());
563        node
564    }
565}
566
567impl<'a> Iterator for Children<'a> {
568    type Item = NodeRef<'a>;
569
570    fn next(&mut self) -> Option<Self::Item> {
571        let node = self.next.take();
572        self.next = node.as_ref().and_then(|node| node.next_sibling());
573        node
574    }
575}
576
577impl<'a> Iterator for Descendants<'a> {
578    type Item = NodeRef<'a>;
579
580    fn next(&mut self) -> Option<Self::Item> {
581        let node = self.next.take();
582        if let Some(node) = node {
583            self.next = node
584                .first_child()
585                .or_else(|| node.next_sibling())
586                .or_else(|| {
587                    node.ancestors()
588                        .take_while(|n| n.id != self.root_id)
589                        .find(|n| n.node.next_sibling.is_some())
590                        .and_then(|n| n.next_sibling())
591                });
592        }
593        node
594    }
595}