cadmus_core/document/html/
xml.rs

1use super::dom::{element, text, whitespace};
2use super::dom::{Attributes, NodeId, XmlTree};
3use fxhash::FxHashMap;
4
/// A small, non-validating XML parser that reads markup from a borrowed
/// string and builds an `XmlTree` from it.
///
/// The parser keeps a cursor into the input and never reports errors.
#[derive(Debug)]
pub struct XmlParser<'a> {
    /// The complete source text being parsed.
    pub input: &'a str,
    /// Byte offset into `input` of the next character to read.
    pub offset: usize,
}
10
11impl<'a> XmlParser<'a> {
12    pub fn new(input: &str) -> XmlParser<'_> {
13        XmlParser { input, offset: 0 }
14    }
15
16    fn eof(&self) -> bool {
17        self.offset >= self.input.len()
18    }
19
20    fn next(&self) -> Option<char> {
21        self.input[self.offset..].chars().next()
22    }
23
24    fn starts_with(&self, s: &str) -> bool {
25        self.input[self.offset..].starts_with(s)
26    }
27
28    fn advance(&mut self, n: usize) {
29        for c in self.input[self.offset..].chars().take(n) {
30            self.offset += c.len_utf8();
31        }
32    }
33
34    fn advance_while<F>(&mut self, test: F)
35    where
36        F: FnMut(&char) -> bool,
37    {
38        for c in self.input[self.offset..].chars().take_while(test) {
39            self.offset += c.len_utf8();
40        }
41    }
42
43    fn advance_until(&mut self, target: &str) {
44        while !self.eof() && !self.starts_with(target) {
45            self.advance(1);
46        }
47        self.advance(target.chars().count());
48    }
49
50    fn parse_attributes(&mut self) -> Attributes {
51        let mut attrs = FxHashMap::default();
52        while !self.eof() {
53            self.advance_while(|&c| c.is_xml_whitespace());
54            match self.next() {
55                Some('>') | Some('/') | None => break,
56                _ => {
57                    let offset = self.offset;
58                    self.advance_while(|&c| c != '=');
59                    let key = self.input[offset..self.offset].to_string();
60                    self.advance_while(|&c| c != '"' && c != '\'');
61                    let quote = self.next().unwrap_or('"');
62                    self.advance(1);
63                    let offset = self.offset;
64                    self.advance_while(|&c| c != quote);
65                    let value = self.input[offset..self.offset].to_string();
66                    attrs.insert(key, value);
67                    self.advance(1);
68                }
69            }
70        }
71        attrs
72    }
73
74    fn parse_element(&mut self, tree: &mut XmlTree, parent_id: NodeId) {
75        let offset = self.offset;
76        self.advance_while(|&c| c != '>' && c != '/' && !c.is_xml_whitespace());
77        let name = &self.input[offset..self.offset];
78        let attributes = self.parse_attributes();
79
80        match self.next() {
81            Some('/') => {
82                self.advance(2);
83                tree.get_mut(parent_id)
84                    .append(element(name, offset - 1, attributes));
85            }
86            Some('>') => {
87                self.advance(1);
88                let id = tree
89                    .get_mut(parent_id)
90                    .append(element(name, offset - 1, attributes));
91                self.parse_nodes(tree, id);
92            }
93            _ => (),
94        }
95    }
96
97    fn parse_nodes(&mut self, tree: &mut XmlTree, parent_id: NodeId) {
98        while !self.eof() {
99            let offset = self.offset;
100            self.advance_while(|&c| c.is_xml_whitespace());
101
102            match self.next() {
103                Some('<') => {
104                    if self.offset > offset {
105                        tree.get_mut(parent_id)
106                            .append(whitespace(&self.input[offset..self.offset], offset));
107                    }
108                    if self.starts_with("</") {
109                        self.advance(2);
110                        self.advance_while(|&c| c != '>');
111                        self.advance(1);
112                        break;
113                    }
114                    self.advance(1);
115                    match self.next() {
116                        Some('?') => {
117                            self.advance(1);
118                            self.advance_until("?>");
119                        }
120                        Some('!') => {
121                            self.advance(1);
122                            match self.next() {
123                                Some('-') => {
124                                    self.advance(2);
125                                    self.advance_until("-->");
126                                }
127                                Some('[') => {
128                                    self.advance(1);
129                                    self.advance_until("]]>");
130                                }
131                                _ => {
132                                    self.advance_while(|&c| c != '>');
133                                    self.advance(1);
134                                }
135                            }
136                        }
137                        _ => self.parse_element(tree, parent_id),
138                    }
139                }
140                Some(..) => {
141                    self.advance_while(|&c| c != '<');
142                    tree.get_mut(parent_id)
143                        .append(text(&self.input[offset..self.offset], offset));
144                }
145                None => break,
146            }
147        }
148    }
149
150    pub fn parse(&mut self) -> XmlTree {
151        let mut tree = XmlTree::new();
152        self.parse_nodes(&mut tree, NodeId::from_index(0));
153        tree
154    }
155}
156
/// Extension trait for recognising XML whitespace.
pub trait XmlExt {
    /// Returns `true` if the value counts as XML whitespace.
    fn is_xml_whitespace(&self) -> bool;
}
160
161impl XmlExt for char {
162    fn is_xml_whitespace(&self) -> bool {
163        matches!(self, ' ' | '\t' | '\n' | '\r')
164    }
165}
166
#[cfg(test)]
mod tests {
    use super::*;

    /// A self-closing element is the root's first child and records the
    /// offset of its opening `<`.
    #[test]
    fn test_simple_element() {
        let tree = XmlParser::new("<a/>").parse();
        let node = tree.root().first_child().unwrap();
        assert_eq!(node.offset(), 0);
        assert_eq!(node.tag_name(), Some("a"));
    }

    /// Both quote styles are accepted, and the other quote character may
    /// appear inside a value.
    #[test]
    fn test_attributes() {
        let tree = XmlParser::new(r#"<a b="c" d='e"'/>"#).parse();
        let node = tree.root().first_child().unwrap();
        assert_eq!(node.attribute("b"), Some("c"));
        assert_eq!(node.attribute("d"), Some("e\""));
    }

    /// Character data becomes a child node positioned at its first byte.
    #[test]
    fn test_text() {
        let tree = XmlParser::new("<a>bcd</a>").parse();
        let first = tree.root().first_child().unwrap().children().next();
        assert_eq!(first.map(|node| node.offset()), Some(3));
        assert_eq!(first.map(|node| node.text()), Some(String::from("bcd")));
    }

    /// Whitespace between two sibling elements is kept as its own node.
    #[test]
    fn test_inbetween_space() {
        let tree = XmlParser::new("<a><b>x</b> <c>y</c></a>").parse();
        let second = tree.root().first_child().unwrap().children().nth(1);
        assert_eq!(second.map(|node| node.text()), Some(String::from(" ")));
    }

    /// Whitespace that is the only content of an element is kept.
    #[test]
    fn test_central_space() {
        let tree = XmlParser::new("<a><b> </b></a>").parse();
        assert_eq!(tree.root().text(), " ");
    }
}