cadmus_core/document/html/
dom.rs1use fxhash::{FxHashMap, FxHashSet};
2use std::num::NonZeroUsize;
3
4pub type Attributes = FxHashMap<String, String>;
5pub const WRAPPER_TAG_NAME: &str = "anonymous";
6
7#[derive(Debug, Clone)]
8pub enum NodeData {
9 Root,
10 Wrapper(usize),
11 Element(ElementData),
12 Text(TextData),
13 Whitespace(TextData),
14}
15
16#[derive(Debug, Clone)]
17pub struct ElementData {
18 pub offset: usize,
19 pub name: String,
20 pub qualified_name: Option<String>,
21 pub attributes: Attributes,
22}
23
24impl ElementData {
25 fn is_block(&self) -> bool {
26 matches!(
27 self.name.as_str(),
28 "address"
29 | "article"
30 | "aside"
31 | "blockquote"
32 | "body"
33 | "head"
34 | "details"
35 | "dialog"
36 | "dd"
37 | "div"
38 | "dl"
39 | "dt"
40 | "fieldset"
41 | "figcaption"
42 | "figure"
43 | "footer"
44 | "form"
45 | "h1"
46 | "h2"
47 | "h3"
48 | "h4"
49 | "h5"
50 | "h6"
51 | "header"
52 | "hgroup"
53 | "hr"
54 | "html"
55 | "li"
56 | "main"
57 | "nav"
58 | "ol"
59 | "p"
60 | "pre"
61 | "section"
62 | "table"
63 | "thead"
64 | "colgroup"
65 | "tbody"
66 | "tfoot"
67 | "tr"
68 | "caption"
69 | "td"
70 | "th"
71 | "ul"
72 )
73 }
74}
75
76impl NodeData {
77 fn text(&self) -> Option<&str> {
78 match *self {
79 NodeData::Text(TextData { ref text, .. })
80 | NodeData::Whitespace(TextData { ref text, .. }) => Some(text),
81 _ => None,
82 }
83 }
84
85 fn offset(&self) -> usize {
86 match *self {
87 NodeData::Text(TextData { offset, .. })
88 | NodeData::Whitespace(TextData { offset, .. })
89 | NodeData::Element(ElementData { offset, .. }) => offset,
90 NodeData::Wrapper(offset) => offset,
91 NodeData::Root => 0,
92 }
93 }
94}
95
96#[derive(Debug, Clone)]
97pub struct TextData {
98 pub offset: usize,
99 pub text: String,
100}
101
102pub fn element(name: &str, offset: usize, attributes: Attributes) -> NodeData {
103 let colon = name.find(':');
104 NodeData::Element(ElementData {
105 offset,
106 name: name[colon.map(|index| index + 1).unwrap_or(0)..].to_string(),
107 qualified_name: colon.map(|_| name.to_string()),
108 attributes,
109 })
110}
111
112pub fn text(text: &str, offset: usize) -> NodeData {
113 NodeData::Text(TextData {
114 offset,
115 text: text.to_string(),
116 })
117}
118
119pub fn whitespace(text: &str, offset: usize) -> NodeData {
120 NodeData::Whitespace(TextData {
121 offset,
122 text: text.to_string(),
123 })
124}
125
126#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
127pub struct NodeId(NonZeroUsize);
128
129impl NodeId {
130 pub fn from_index(n: usize) -> Self {
131 NodeId(unsafe { NonZeroUsize::new_unchecked(n + 1) })
132 }
133
134 pub fn to_index(self) -> usize {
135 self.0.get() - 1
136 }
137}
138
139#[derive(Debug, Clone)]
140pub struct XmlTree {
141 nodes: Vec<Node>,
142}
143
144#[derive(Debug, Clone)]
145pub struct Node {
146 data: NodeData,
147 parent: Option<NodeId>,
148 previous_sibling: Option<NodeId>,
149 next_sibling: Option<NodeId>,
150 first_child: Option<NodeId>,
151 last_child: Option<NodeId>,
152}
153
154impl Default for Node {
155 fn default() -> Self {
156 Node {
157 data: NodeData::Root,
158 parent: None,
159 previous_sibling: None,
160 next_sibling: None,
161 first_child: None,
162 last_child: None,
163 }
164 }
165}
166
167#[derive(Debug, Copy, Clone)]
168pub struct NodeRef<'a> {
169 pub id: NodeId,
170 pub node: &'a Node,
171 pub tree: &'a XmlTree,
172}
173
174#[derive(Debug)]
175pub struct NodeMut<'a> {
176 pub id: NodeId,
177 pub tree: &'a mut XmlTree,
178}
179
180impl XmlTree {
181 pub fn new() -> Self {
182 XmlTree {
183 nodes: vec![Node::default()],
184 }
185 }
186
187 fn node(&self, id: NodeId) -> &Node {
188 unsafe { self.nodes.get_unchecked(id.to_index()) }
189 }
190
191 fn node_mut(&mut self, id: NodeId) -> &mut Node {
192 unsafe { self.nodes.get_unchecked_mut(id.to_index()) }
193 }
194
195 pub fn get(&self, id: NodeId) -> NodeRef<'_> {
196 NodeRef {
197 id,
198 node: self.node(id),
199 tree: self,
200 }
201 }
202
203 pub fn get_mut(&mut self, id: NodeId) -> NodeMut<'_> {
204 NodeMut { id, tree: self }
205 }
206
207 pub fn root(&self) -> NodeRef<'_> {
208 self.get(NodeId::from_index(0))
209 }
210
211 pub fn root_mut(&mut self) -> NodeMut<'_> {
212 self.get_mut(NodeId::from_index(0))
213 }
214
215 pub fn push_node(&mut self, data: NodeData) -> NodeId {
216 let id = NodeId::from_index(self.nodes.len());
217 let node = Node {
218 data,
219 parent: None,
220 previous_sibling: None,
221 next_sibling: None,
222 first_child: None,
223 last_child: None,
224 };
225 self.nodes.push(node);
226 id
227 }
228
229 pub fn attach_child(&mut self, parent_id: NodeId, child_id: NodeId) {
230 let last_child = self.node(parent_id).last_child;
231
232 let child = self.node_mut(child_id);
233 child.parent = Some(parent_id);
234 child.previous_sibling = last_child;
235 child.next_sibling = None;
236
237 if let Some(last) = last_child {
238 self.node_mut(last).next_sibling = Some(child_id);
239 }
240
241 self.node_mut(parent_id).last_child = Some(child_id);
242
243 if self.node(parent_id).first_child.is_none() {
244 self.node_mut(parent_id).first_child = Some(child_id);
245 }
246 }
247
248 pub fn insert_before(&mut self, sibling_id: NodeId, new_id: NodeId) {
249 let parent_id = self.node(sibling_id).parent;
250 let prev_sibling = self.node(sibling_id).previous_sibling;
251
252 self.node_mut(new_id).parent = parent_id;
253 self.node_mut(new_id).previous_sibling = prev_sibling;
254 self.node_mut(new_id).next_sibling = Some(sibling_id);
255
256 self.node_mut(sibling_id).previous_sibling = Some(new_id);
257
258 if let Some(prev) = prev_sibling {
259 self.node_mut(prev).next_sibling = Some(new_id);
260 } else if let Some(parent) = parent_id {
261 self.node_mut(parent).first_child = Some(new_id);
262 }
263 }
264
265 pub fn detach(&mut self, id: NodeId) {
266 let node = self.node(id);
267 let parent_id = node.parent;
268 let prev_sibling = node.previous_sibling;
269 let next_sibling = node.next_sibling;
270
271 if let Some(prev) = prev_sibling {
272 self.node_mut(prev).next_sibling = next_sibling;
273 } else if let Some(parent) = parent_id {
274 self.node_mut(parent).first_child = next_sibling;
275 }
276
277 if let Some(next) = next_sibling {
278 self.node_mut(next).previous_sibling = prev_sibling;
279 } else if let Some(parent) = parent_id {
280 self.node_mut(parent).last_child = prev_sibling;
281 }
282
283 self.node_mut(id).parent = None;
284 self.node_mut(id).previous_sibling = None;
285 self.node_mut(id).next_sibling = None;
286 }
287
288 pub fn append_text_to(&mut self, id: NodeId, extra: &str) {
289 match &mut self.node_mut(id).data {
290 NodeData::Text(TextData { ref mut text, .. })
291 | NodeData::Whitespace(TextData { ref mut text, .. }) => {
292 text.push_str(extra);
293 }
294 _ => {}
295 }
296 }
297
298 pub fn add_attr_if_missing(&mut self, id: NodeId, name: &str, value: &str) {
299 if let NodeData::Element(ElementData {
300 ref mut attributes, ..
301 }) = self.node_mut(id).data
302 {
303 if !attributes.contains_key(name) {
304 attributes.insert(name.to_string(), value.to_string());
305 }
306 }
307 }
308
309 pub fn wrap_lost_inlines(&mut self) {
310 let mut ids = Vec::new();
311 let mut known_ids = FxHashSet::default();
312
313 for n in self.root().descendants().filter(|n| n.is_inline()) {
314 if known_ids.contains(&n.id) {
315 continue;
316 }
317
318 let mut first_id = None;
319 let mut last_id = None;
320
321 for s in n.previous_siblings() {
322 if s.is_block() {
323 first_id = Some(s.next_sibling().unwrap().id);
324 break;
325 } else {
326 known_ids.insert(s.id);
327 }
328 }
329
330 for s in n.next_siblings() {
331 if s.is_block() {
332 last_id = Some(s.previous_sibling().unwrap().id);
333 break;
334 } else {
335 known_ids.insert(s.id);
336 }
337 }
338
339 if first_id.is_some() || last_id.is_some() {
340 let parent = n.parent().unwrap();
341 ids.push([
342 parent.id,
343 first_id.unwrap_or_else(|| parent.node.first_child.unwrap()),
344 last_id.unwrap_or_else(|| parent.node.last_child.unwrap()),
345 ]);
346 }
347 }
348
349 for [parent_id, first_id, last_id] in ids {
350 let offset = self.node(first_id).data.offset();
351 let mut node = self.get_mut(parent_id);
352 node.wrap_range(first_id, last_id, NodeData::Wrapper(offset));
353 }
354 }
355}
356
357impl<'a> NodeRef<'a> {
358 pub fn parent(&self) -> Option<Self> {
359 self.node.parent.map(|id| self.tree.get(id))
360 }
361
362 pub fn parent_element(&self) -> Option<Self> {
363 self.ancestors().find(|n| n.is_element() && !n.is_wrapper())
364 }
365
366 pub fn previous_sibling(&self) -> Option<Self> {
367 self.node.previous_sibling.map(|id| self.tree.get(id))
368 }
369
370 pub fn previous_sibling_element(&self) -> Option<NodeRef<'a>> {
371 self.previous_sibling_elements().next()
372 }
373
374 pub fn next_sibling_element(&self) -> Option<NodeRef<'a>> {
375 self.next_sibling_elements().next()
376 }
377
378 pub fn next_sibling(&self) -> Option<Self> {
379 self.node.next_sibling.map(|id| self.tree.get(id))
380 }
381
382 pub fn first_child(&self) -> Option<Self> {
383 self.node.first_child.map(|id| self.tree.get(id))
384 }
385
386 pub fn last_child(&self) -> Option<Self> {
387 self.node.last_child.map(|id| self.tree.get(id))
388 }
389
390 pub fn ancestors(&self) -> Ancestors<'a> {
391 Ancestors {
392 next: self.parent(),
393 }
394 }
395
396 pub fn ancestor_elements(&self) -> impl Iterator<Item = NodeRef<'a>> {
397 self.ancestors()
398 .filter(|n| n.is_element() && !n.is_wrapper())
399 }
400
401 pub fn previous_siblings(&self) -> PreviousSiblings<'a> {
402 PreviousSiblings {
403 next: self.previous_sibling(),
404 }
405 }
406
407 pub fn next_siblings(&self) -> NextSiblings<'a> {
408 NextSiblings {
409 next: self.next_sibling(),
410 }
411 }
412
413 pub fn previous_sibling_elements(&self) -> impl Iterator<Item = NodeRef<'a>> {
414 self.previous_siblings().filter(|n| n.is_element())
415 }
416
417 pub fn next_sibling_elements(&self) -> impl Iterator<Item = NodeRef<'a>> {
418 self.next_siblings().filter(|n| n.is_element())
419 }
420
421 pub fn children(&self) -> Children<'a> {
422 Children {
423 next: self.first_child(),
424 }
425 }
426
427 pub fn descendants(&self) -> Descendants<'a> {
428 Descendants {
429 root_id: self.id,
430 next: self.first_child(),
431 }
432 }
433
434 pub fn has_children(&self) -> bool {
435 self.node.first_child.is_some()
436 }
437
438 pub fn is_element(&self) -> bool {
439 matches!(
440 self.node.data,
441 NodeData::Element { .. } | NodeData::Wrapper(..) | NodeData::Root
442 )
443 }
444
445 pub fn is_inline(&self) -> bool {
446 match &self.node.data {
447 NodeData::Element(e) => !e.is_block(),
448 NodeData::Text(..) => true,
449 _ => false,
450 }
451 }
452
453 pub fn is_block(&self) -> bool {
454 match &self.node.data {
455 NodeData::Element(e) => e.is_block(),
456 NodeData::Wrapper(..) | NodeData::Root => true,
457 _ => false,
458 }
459 }
460
461 pub fn is_wrapper(&self) -> bool {
462 matches!(self.node.data, NodeData::Wrapper(..))
463 }
464
465 pub fn data(&self) -> &'a NodeData {
466 &self.node.data
467 }
468
469 pub fn offset(&self) -> usize {
470 self.node.data.offset()
471 }
472
473 pub fn text(&self) -> String {
474 self.node.data.text().map(String::from).unwrap_or_else(|| {
475 self.descendants()
476 .filter_map(|n| n.node.data.text())
477 .fold(String::new(), |mut a, b| {
478 a.push_str(b);
479 a
480 })
481 })
482 }
483
484 pub fn tag_name(&self) -> Option<&'a str> {
485 match self.node.data {
486 NodeData::Element(ElementData { ref name, .. }) => Some(name),
487 NodeData::Wrapper(..) => Some(WRAPPER_TAG_NAME),
488 _ => None,
489 }
490 }
491
492 pub fn tag_qualified_name(&self) -> Option<&'a str> {
493 match self.node.data {
494 NodeData::Element(ElementData {
495 ref qualified_name, ..
496 }) => qualified_name.as_deref(),
497 _ => None,
498 }
499 }
500
501 pub fn attributes(&self) -> Option<&'a Attributes> {
502 match self.node.data {
503 NodeData::Element(ElementData { ref attributes, .. }) => Some(attributes),
504 _ => None,
505 }
506 }
507
508 pub fn attribute(&self, name: &str) -> Option<&'a str> {
509 self.attributes()
510 .and_then(|a| a.get(name).map(String::as_str))
511 }
512
513 pub fn classes(&self) -> impl Iterator<Item = &'a str> {
514 self.attribute("class").unwrap_or("").split_whitespace()
515 }
516
517 pub fn id(&self) -> Option<&str> {
518 self.attribute("id")
519 }
520
521 pub fn find(&self, tag_name: &str) -> Option<Self> {
522 self.descendants().find(|n| n.tag_name() == Some(tag_name))
523 }
524
525 pub fn find_by_id(&self, id: &str) -> Option<Self> {
526 self.descendants().find(|n| n.id() == Some(id))
527 }
528}
529
530impl<'a> NodeMut<'a> {
531 fn node(&mut self) -> &mut Node {
532 self.tree.node_mut(self.id)
533 }
534
535 pub fn append(&mut self, data: NodeData) -> NodeId {
536 let id = NodeId::from_index(self.tree.nodes.len());
537
538 let node = Node {
539 data,
540 parent: Some(self.id),
541 previous_sibling: self.node().last_child,
542 next_sibling: None,
543 first_child: None,
544 last_child: None,
545 };
546
547 self.tree.nodes.push(node);
548
549 if let Some(last_child) = self.node().last_child {
550 self.tree.node_mut(last_child).next_sibling = Some(id);
551 }
552
553 self.node().last_child = Some(id);
554
555 if self.node().first_child.is_none() {
556 self.node().first_child = Some(id);
557 }
558
559 id
560 }
561
562 pub fn wrap_range(&mut self, first_id: NodeId, last_id: NodeId, data: NodeData) {
563 let before = self.tree.node(first_id).previous_sibling;
564 let after = self.tree.node(last_id).next_sibling;
565 let id = NodeId::from_index(self.tree.nodes.len());
566
567 let node = Node {
568 data,
569 parent: Some(self.id),
570 previous_sibling: before,
571 next_sibling: after,
572 first_child: Some(first_id),
573 last_child: Some(last_id),
574 };
575
576 self.tree.nodes.push(node);
577
578 if let Some(before_id) = before {
579 self.tree.node_mut(before_id).next_sibling = Some(id);
580 }
581
582 if let Some(after_id) = after {
583 self.tree.node_mut(after_id).previous_sibling = Some(id);
584 }
585
586 if let Some(first_child_id) = self.node().first_child {
587 if first_child_id == first_id {
588 self.node().first_child = Some(id);
589 }
590 }
591
592 if let Some(last_child_id) = self.node().last_child {
593 if last_child_id == last_id {
594 self.node().last_child = Some(id);
595 }
596 }
597
598 self.tree.node_mut(first_id).previous_sibling = None;
599 self.tree.node_mut(last_id).next_sibling = None;
600 self.tree.node_mut(first_id).parent = Some(id);
601
602 let mut node_id = first_id;
603 while let Some(next_id) = self.tree.node(node_id).next_sibling {
604 self.tree.node_mut(next_id).parent = Some(id);
605 node_id = next_id;
606 }
607 }
608}
609
610pub struct Ancestors<'a> {
611 next: Option<NodeRef<'a>>,
612}
613
614pub struct NextSiblings<'a> {
615 next: Option<NodeRef<'a>>,
616}
617
618pub struct PreviousSiblings<'a> {
619 next: Option<NodeRef<'a>>,
620}
621
622pub struct Children<'a> {
623 next: Option<NodeRef<'a>>,
624}
625
626pub struct Descendants<'a> {
627 root_id: NodeId,
628 next: Option<NodeRef<'a>>,
629}
630
631impl<'a> Iterator for Ancestors<'a> {
632 type Item = NodeRef<'a>;
633
634 fn next(&mut self) -> Option<Self::Item> {
635 let node = self.next.take();
636 self.next = node.as_ref().and_then(|node| node.parent());
637 node
638 }
639}
640
641impl<'a> Iterator for PreviousSiblings<'a> {
642 type Item = NodeRef<'a>;
643
644 fn next(&mut self) -> Option<Self::Item> {
645 let node = self.next.take();
646 self.next = node.as_ref().and_then(|node| node.previous_sibling());
647 node
648 }
649}
650
651impl<'a> Iterator for NextSiblings<'a> {
652 type Item = NodeRef<'a>;
653
654 fn next(&mut self) -> Option<Self::Item> {
655 let node = self.next.take();
656 self.next = node.as_ref().and_then(|node| node.next_sibling());
657 node
658 }
659}
660
661impl<'a> Iterator for Children<'a> {
662 type Item = NodeRef<'a>;
663
664 fn next(&mut self) -> Option<Self::Item> {
665 let node = self.next.take();
666 self.next = node.as_ref().and_then(|node| node.next_sibling());
667 node
668 }
669}
670
671impl<'a> Iterator for Descendants<'a> {
672 type Item = NodeRef<'a>;
673
674 fn next(&mut self) -> Option<Self::Item> {
675 let node = self.next.take();
676 if let Some(node) = node {
677 self.next = node
678 .first_child()
679 .or_else(|| node.next_sibling())
680 .or_else(|| {
681 node.ancestors()
682 .take_while(|n| n.id != self.root_id)
683 .find(|n| n.node.next_sibling.is_some())
684 .and_then(|n| n.next_sibling())
685 });
686 }
687 node
688 }
689}