cadmus_core/document/html/
xml.rs1use super::dom::{element, text, whitespace};
2use super::dom::{Attributes, NodeId, XmlTree};
3use fxhash::FxHashMap;
4
/// Cursor-based parser over a borrowed XML source string.
///
/// The parser never copies the source up front: it keeps a reference to the
/// text and a position inside it, and slices the text as nodes are produced.
#[derive(Debug)]
pub struct XmlParser<'a> {
    /// The complete source text being parsed.
    pub input: &'a str,
    /// Byte offset of the cursor within `input`.
    pub offset: usize,
}
10
11impl<'a> XmlParser<'a> {
12 pub fn new(input: &str) -> XmlParser<'_> {
13 XmlParser { input, offset: 0 }
14 }
15
16 fn eof(&self) -> bool {
17 self.offset >= self.input.len()
18 }
19
20 fn next(&self) -> Option<char> {
21 self.input[self.offset..].chars().next()
22 }
23
24 fn starts_with(&self, s: &str) -> bool {
25 self.input[self.offset..].starts_with(s)
26 }
27
28 fn advance(&mut self, n: usize) {
29 for c in self.input[self.offset..].chars().take(n) {
30 self.offset += c.len_utf8();
31 }
32 }
33
34 fn advance_while<F>(&mut self, test: F)
35 where
36 F: FnMut(&char) -> bool,
37 {
38 for c in self.input[self.offset..].chars().take_while(test) {
39 self.offset += c.len_utf8();
40 }
41 }
42
43 fn advance_until(&mut self, target: &str) {
44 while !self.eof() && !self.starts_with(target) {
45 self.advance(1);
46 }
47 self.advance(target.chars().count());
48 }
49
50 fn parse_attributes(&mut self) -> Attributes {
51 let mut attrs = FxHashMap::default();
52 while !self.eof() {
53 self.advance_while(|&c| c.is_xml_whitespace());
54 match self.next() {
55 Some('>') | Some('/') | None => break,
56 _ => {
57 let offset = self.offset;
58 self.advance_while(|&c| c != '=');
59 let key = self.input[offset..self.offset].to_string();
60 self.advance_while(|&c| c != '"' && c != '\'');
61 let quote = self.next().unwrap_or('"');
62 self.advance(1);
63 let offset = self.offset;
64 self.advance_while(|&c| c != quote);
65 let value = self.input[offset..self.offset].to_string();
66 attrs.insert(key, value);
67 self.advance(1);
68 }
69 }
70 }
71 attrs
72 }
73
74 fn parse_element(&mut self, tree: &mut XmlTree, parent_id: NodeId) {
75 let offset = self.offset;
76 self.advance_while(|&c| c != '>' && c != '/' && !c.is_xml_whitespace());
77 let name = &self.input[offset..self.offset];
78 let attributes = self.parse_attributes();
79
80 match self.next() {
81 Some('/') => {
82 self.advance(2);
83 tree.get_mut(parent_id)
84 .append(element(name, offset - 1, attributes));
85 }
86 Some('>') => {
87 self.advance(1);
88 let id = tree
89 .get_mut(parent_id)
90 .append(element(name, offset - 1, attributes));
91 self.parse_nodes(tree, id);
92 }
93 _ => (),
94 }
95 }
96
97 fn parse_nodes(&mut self, tree: &mut XmlTree, parent_id: NodeId) {
98 while !self.eof() {
99 let offset = self.offset;
100 self.advance_while(|&c| c.is_xml_whitespace());
101
102 match self.next() {
103 Some('<') => {
104 if self.offset > offset {
105 tree.get_mut(parent_id)
106 .append(whitespace(&self.input[offset..self.offset], offset));
107 }
108 if self.starts_with("</") {
109 self.advance(2);
110 self.advance_while(|&c| c != '>');
111 self.advance(1);
112 break;
113 }
114 self.advance(1);
115 match self.next() {
116 Some('?') => {
117 self.advance(1);
118 self.advance_until("?>");
119 }
120 Some('!') => {
121 self.advance(1);
122 match self.next() {
123 Some('-') => {
124 self.advance(2);
125 self.advance_until("-->");
126 }
127 Some('[') => {
128 self.advance(1);
129 self.advance_until("]]>");
130 }
131 _ => {
132 self.advance_while(|&c| c != '>');
133 self.advance(1);
134 }
135 }
136 }
137 _ => self.parse_element(tree, parent_id),
138 }
139 }
140 Some(..) => {
141 self.advance_while(|&c| c != '<');
142 tree.get_mut(parent_id)
143 .append(text(&self.input[offset..self.offset], offset));
144 }
145 None => break,
146 }
147 }
148 }
149
150 pub fn parse(&mut self) -> XmlTree {
151 let mut tree = XmlTree::new();
152 self.parse_nodes(&mut tree, NodeId::from_index(0));
153 tree
154 }
155}
156
/// Classification helpers based on the XML grammar.
pub trait XmlExt {
    /// Returns `true` when the value is an XML white space character.
    fn is_xml_whitespace(&self) -> bool;
}

impl XmlExt for char {
    /// XML white space is limited to space, tab, line feed and carriage
    /// return; other Unicode white space does not qualify.
    fn is_xml_whitespace(&self) -> bool {
        [' ', '\t', '\n', '\r'].contains(self)
    }
}
166
#[cfg(test)]
mod tests {
    use super::*;

    /// A self-closing element becomes the first child of the root and is
    /// recorded at the offset of its `<`.
    #[test]
    fn test_simple_element() {
        let source = "<a/>";
        let tree = XmlParser::new(source).parse();
        let node = tree.root().first_child().unwrap();
        assert_eq!(node.offset(), 0);
        assert_eq!(node.tag_name(), Some("a"));
    }

    /// Both quoting styles are understood, and the other kind of quote may
    /// appear inside a value.
    #[test]
    fn test_attributes() {
        let source = r#"<a b="c" d='e"'/>"#;
        let tree = XmlParser::new(source).parse();
        let node = tree.root().first_child().unwrap();
        assert_eq!(node.attribute("b"), Some("c"));
        assert_eq!(node.attribute("d"), Some("e\""));
    }

    /// Text content is stored with the offset of its first character.
    #[test]
    fn test_text() {
        let source = "<a>bcd</a>";
        let tree = XmlParser::new(source).parse();
        let first = tree.root().first_child().unwrap().children().next();
        assert_eq!(first.map(|c| c.offset()), Some(3));
        assert_eq!(first.map(|c| c.text()), Some("bcd".to_string()));
    }

    /// White space between two sibling elements is kept as a node.
    #[test]
    fn test_inbetween_space() {
        let source = "<a><b>x</b> <c>y</c></a>";
        let tree = XmlParser::new(source).parse();
        let second = tree.root().first_child().unwrap().children().nth(1);
        assert_eq!(second.map(|c| c.text()), Some(" ".to_string()));
    }

    /// White space that is the sole content of an element is kept.
    #[test]
    fn test_central_space() {
        let source = "<a><b> </b></a>";
        let tree = XmlParser::new(source).parse();
        assert_eq!(tree.root().text(), " ");
    }
}