1pub mod css;
20pub mod dom;
21pub mod engine;
22pub mod html5;
23pub mod layout;
24pub mod parse;
25pub mod style;
26pub mod xml;
27
28pub use html5::Html5Document;
29
30use self::css::CssParser;
31use self::dom::{NodeRef, XmlTree};
32use self::engine::{Engine, Page, ResourceFetcher};
33use self::layout::{DrawCommand, ImageCommand, TextAlign, TextCommand};
34use self::layout::{DrawState, LoopContext, RootData, StyleData};
35use self::style::StyleSheet;
36use self::xml::XmlParser;
37use crate::document::{BoundedText, Document, Location, TextLocation, TocEntry};
38use crate::framebuffer::Pixmap;
39use crate::geom::{Boundary, CycleDir, Edge};
40use crate::helpers::{decode_entities, Normalize};
41use crate::unit::pt_to_px;
42use anyhow::Error;
43use fxhash::FxHashMap;
44use std::fs::{self, File};
45use std::io::{Read, Write};
46use std::path::{Path, PathBuf};
47
/// Relative path of the built-in viewer stylesheet. `HtmlBase::build_pages`
/// always tries to read it first, and `HtmlDocument` uses it as the default
/// viewer stylesheet path.
const VIEWER_STYLESHEET: &str = "css/html.css";
/// Relative path used by `HtmlDocument` as the default user stylesheet.
const USER_STYLESHEET: &str = "css/html-user.css";

/// Maps a `"<name>#<id>"` URI string to the offset of the node carrying that
/// `id` attribute (filled in by `HtmlBase::cache_uris`).
type UriCache = FxHashMap<String, usize>;
56
/// State shared by the HTML-based document types in this module: the parsed
/// tree, the layout engine, the cached pages and the stylesheet settings.
pub(crate) struct HtmlBase {
    /// Parsed document tree.
    pub(crate) content: XmlTree,
    /// Engine used to build the display list and to render pages.
    pub(crate) engine: Engine,
    /// Cached laid-out pages. An empty vector means "not built yet":
    /// `page_index` rebuilds the pages when it finds this empty.
    pub(crate) pages: Vec<Page>,
    /// Directory against which resource names are joined when the document
    /// fetches linked resources (see `ResourceFetcher for PathBuf`).
    pub(crate) parent: PathBuf,
    /// Size value supplied at construction; the constructors in this file
    /// pass the byte length of the source.
    pub(crate) size: usize,
    /// Path of an additional viewer stylesheet, read by `build_pages` when it
    /// differs from `VIEWER_STYLESHEET`.
    pub(crate) viewer_stylesheet: PathBuf,
    /// Path of the user stylesheet, read by `build_pages`.
    pub(crate) user_stylesheet: PathBuf,
    /// When `true`, `build_pages` skips the stylesheets referenced or embedded
    /// in the document's `<head>`.
    pub(crate) ignore_document_css: bool,
}
83
84impl HtmlBase {
85 pub(crate) fn new(
87 content: XmlTree,
88 size: usize,
89 parent: PathBuf,
90 viewer_stylesheet: PathBuf,
91 user_stylesheet: PathBuf,
92 ) -> Self {
93 HtmlBase {
94 content,
95 engine: Engine::new(),
96 pages: Vec::new(),
97 parent,
98 size,
99 viewer_stylesheet,
100 user_stylesheet,
101 ignore_document_css: false,
102 }
103 }
104
105 #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(offset)))]
111 pub(crate) fn page_index(&mut self, offset: usize) -> Option<usize> {
112 if self.pages.is_empty() {
113 self.pages = self.build_pages();
114 }
115 if self.pages.len() < 2
116 || self.pages[1].first().map(|dc| offset < dc.offset()) == Some(true)
117 {
118 return Some(0);
119 } else if self.pages[self.pages.len() - 1]
120 .first()
121 .map(|dc| offset >= dc.offset())
122 == Some(true)
123 {
124 return Some(self.pages.len() - 1);
125 } else {
126 for i in 1..self.pages.len() - 1 {
127 if self.pages[i].first().map(|dc| offset >= dc.offset()) == Some(true)
128 && self.pages[i + 1].first().map(|dc| offset < dc.offset()) == Some(true)
129 {
130 return Some(i);
131 }
132 }
133 }
134 None
135 }
136
137 #[cfg_attr(
144 feature = "tracing",
145 tracing::instrument(skip(self, cache), fields(uri))
146 )]
147 fn resolve_link(&mut self, uri: &str, cache: &mut UriCache) -> Option<usize> {
148 let frag_index = uri.find('#')?;
149 let name = &uri[..frag_index];
150 let content = self.content.clone();
151 self.cache_uris(content.root(), name, cache);
152 cache.get(uri).cloned()
153 }
154
155 fn cache_uris(&mut self, node: NodeRef, name: &str, cache: &mut UriCache) {
159 if let Some(id) = node.attribute("id") {
160 cache.insert(format!("{}#{}", name, id), node.offset());
161 }
162 for child in node.children() {
163 self.cache_uris(child, name, cache);
164 }
165 }
166
167 #[cfg_attr(feature = "tracing", tracing::instrument(skip(self)))]
181 pub(crate) fn build_pages(&mut self) -> Vec<Page> {
182 let mut stylesheet = StyleSheet::new();
183 let spine_dir = PathBuf::default();
184
185 if let Ok(text) = fs::read_to_string(VIEWER_STYLESHEET) {
186 let mut css = CssParser::new(&text).parse();
187 stylesheet.append(&mut css, true);
188 }
189
190 if self.viewer_stylesheet != Path::new(VIEWER_STYLESHEET) {
191 if let Ok(text) = fs::read_to_string(&self.viewer_stylesheet) {
192 let mut css = CssParser::new(&text).parse();
193 stylesheet.append(&mut css, true);
194 }
195 }
196
197 if let Ok(text) = fs::read_to_string(&self.user_stylesheet) {
198 let mut css = CssParser::new(&text).parse();
199 stylesheet.append(&mut css, true);
200 }
201
202 if !self.ignore_document_css {
203 let mut inner_css = StyleSheet::new();
204
205 if let Some(head) = self.content.root().find("head") {
206 for child in head.children() {
207 if child.tag_name() == Some("link")
208 && child.attribute("rel") == Some("stylesheet")
209 {
210 if let Some(href) = child.attribute("href") {
211 if let Some(name) = spine_dir.join(href).normalize().to_str() {
212 if let Ok(buf) = self.parent.fetch(name) {
213 if let Ok(text) = String::from_utf8(buf) {
214 let mut css = CssParser::new(&text).parse();
215 inner_css.append(&mut css, false);
216 }
217 }
218 }
219 }
220 } else if child.tag_name() == Some("style")
221 && child.attribute("type") == Some("text/css")
222 {
223 let mut css = CssParser::new(&child.text()).parse();
224 inner_css.append(&mut css, false);
225 }
226 }
227 }
228
229 stylesheet.append(&mut inner_css, true);
230 }
231
232 let mut pages = Vec::new();
233
234 let mut rect = self.engine.rect();
235 rect.shrink(&self.engine.margin);
236
237 let language = self
238 .content
239 .root()
240 .find("html")
241 .and_then(|html| html.attribute("xml:lang"))
242 .map(String::from);
243
244 let style = StyleData {
245 language,
246 font_size: self.engine.font_size,
247 line_height: pt_to_px(
248 self.engine.line_height * self.engine.font_size,
249 self.engine.dpi,
250 )
251 .round() as i32,
252 text_align: self.engine.text_align,
253 start_x: rect.min.x,
254 end_x: rect.max.x,
255 width: rect.max.x - rect.min.x,
256 ..Default::default()
257 };
258
259 let loop_context = LoopContext::default();
260 let mut draw_state = DrawState {
261 position: rect.min,
262 ..Default::default()
263 };
264
265 let root_data = RootData {
266 start_offset: 0,
267 spine_dir,
268 rect,
269 };
270
271 pages.push(Vec::new());
272
273 self.engine.build_display_list(
274 self.content.root(),
275 &style,
276 &loop_context,
277 &stylesheet,
278 &root_data,
279 &mut self.parent,
280 &mut draw_state,
281 &mut pages,
282 );
283
284 pages.retain(|page| !page.is_empty());
285
286 if pages.is_empty() {
287 pages.push(vec![DrawCommand::Marker(self.content.root().offset())]);
288 }
289
290 pages
291 }
292
293 #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(loc = ?loc)))]
304 pub(crate) fn resolve_location(&mut self, loc: Location) -> Option<usize> {
305 self.engine.load_fonts();
306
307 match loc {
308 Location::Exact(offset) => {
309 let page_index = self.page_index(offset)?;
310 self.pages[page_index].first().map(DrawCommand::offset)
311 }
312 Location::Previous(offset) => {
313 let page_index = self.page_index(offset)?;
314 if page_index > 0 {
315 self.pages[page_index - 1].first().map(DrawCommand::offset)
316 } else {
317 None
318 }
319 }
320 Location::Next(offset) => {
321 let page_index = self.page_index(offset)?;
322 if page_index < self.pages.len() - 1 {
323 self.pages[page_index + 1].first().map(DrawCommand::offset)
324 } else {
325 None
326 }
327 }
328 Location::LocalUri(_, ref uri) | Location::Uri(ref uri) => {
329 let mut cache = FxHashMap::default();
330 self.resolve_link(uri, &mut cache)
331 }
332 }
333 }
334
335 #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(loc = ?loc)))]
338 pub(crate) fn words(&mut self, loc: Location) -> Option<(Vec<BoundedText>, usize)> {
339 let offset = self.resolve_location(loc)?;
340 let page_index = self.page_index(offset)?;
341
342 Some((
343 self.pages[page_index]
344 .iter()
345 .filter_map(|dc| match dc {
346 DrawCommand::Text(TextCommand {
347 text, rect, offset, ..
348 }) => Some(BoundedText {
349 text: text.clone(),
350 rect: (*rect).into(),
351 location: TextLocation::Dynamic(*offset),
352 }),
353 _ => None,
354 })
355 .collect(),
356 offset,
357 ))
358 }
359
360 #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(loc = ?loc)))]
363 pub(crate) fn images(&mut self, loc: Location) -> Option<(Vec<Boundary>, usize)> {
364 let offset = self.resolve_location(loc)?;
365 let page_index = self.page_index(offset)?;
366
367 Some((
368 self.pages[page_index]
369 .iter()
370 .filter_map(|dc| match dc {
371 DrawCommand::Image(ImageCommand { rect, .. }) => Some((*rect).into()),
372 _ => None,
373 })
374 .collect(),
375 offset,
376 ))
377 }
378
379 #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(loc = ?loc)))]
384 pub(crate) fn links(&mut self, loc: Location) -> Option<(Vec<BoundedText>, usize)> {
385 let offset = self.resolve_location(loc)?;
386 let page_index = self.page_index(offset)?;
387
388 Some((
389 self.pages[page_index]
390 .iter()
391 .filter_map(|dc| match dc {
392 DrawCommand::Text(TextCommand {
393 uri, rect, offset, ..
394 })
395 | DrawCommand::Image(ImageCommand {
396 uri, rect, offset, ..
397 }) if uri.is_some() => Some(BoundedText {
398 text: uri.clone().unwrap(),
399 rect: (*rect).into(),
400 location: TextLocation::Dynamic(*offset),
401 }),
402 _ => None,
403 })
404 .collect(),
405 offset,
406 ))
407 }
408
409 #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(loc = ?loc, scale, samples)))]
412 pub(crate) fn pixmap(
413 &mut self,
414 loc: Location,
415 scale: f32,
416 samples: usize,
417 ) -> Option<(Pixmap, usize)> {
418 let offset = self.resolve_location(loc)?;
419 let page_index = self.page_index(offset)?;
420 let page = self.pages[page_index].clone();
421 let pixmap = self
422 .engine
423 .render_page(&page, scale, samples, &mut self.parent)?;
424
425 Some((pixmap, offset))
426 }
427
428 pub(crate) fn metadata(&self, key: &str) -> Option<String> {
431 self.content
432 .root()
433 .find("head")
434 .and_then(|head| {
435 head.children().find(|child| {
436 child.tag_name() == Some("meta") && child.attribute("name") == Some(key)
437 })
438 })
439 .and_then(|child| {
440 child
441 .attribute("content")
442 .map(|s| decode_entities(s).into_owned())
443 })
444 }
445}
446
447impl ResourceFetcher for PathBuf {
450 fn fetch(&mut self, name: &str) -> Result<Vec<u8>, Error> {
451 let mut file = File::open(self.join(name))?;
452 let mut buf = Vec::new();
453 file.read_to_end(&mut buf)?;
454 Ok(buf)
455 }
456}
457
/// An HTML document: the source text plus the shared layout state.
pub struct HtmlDocument {
    /// Source text of the document; this is what `save` writes out.
    text: String,
    /// Parsed tree, engine, page cache and stylesheet settings.
    base: HtmlBase,
}
470
// These impls assert, without compiler verification, that `HtmlDocument` can
// be moved to and shared between threads.
// NOTE(review): the invariant that makes this sound is not visible in this
// file -- presumably some field of `HtmlBase` is not automatically
// `Send`/`Sync`; confirm and record the justification here.
unsafe impl Send for HtmlDocument {}
unsafe impl Sync for HtmlDocument {}
473
474impl HtmlDocument {
475 #[cfg_attr(feature = "tracing", tracing::instrument(skip(path), fields(path = %path.as_ref().display())))]
478 pub fn new<P: AsRef<Path>>(path: P) -> Result<HtmlDocument, Error> {
479 let mut file = File::open(&path)?;
480 let size = file.metadata()?.len() as usize;
481 let mut text = String::new();
482 file.read_to_string(&mut text)?;
483 let mut content = XmlParser::new(&text).parse();
484 content.wrap_lost_inlines();
485 let parent = path.as_ref().parent().unwrap_or_else(|| Path::new(""));
486
487 Ok(HtmlDocument {
488 text,
489 base: HtmlBase::new(
490 content,
491 size,
492 parent.to_path_buf(),
493 PathBuf::from(VIEWER_STYLESHEET),
494 PathBuf::from(USER_STYLESHEET),
495 ),
496 })
497 }
498
499 #[cfg_attr(feature = "tracing", tracing::instrument(skip(text), fields(len = text.len())))]
504 pub fn new_from_memory(text: &str) -> HtmlDocument {
505 let size = text.len();
506 let mut content = XmlParser::new(text).parse();
507 content.wrap_lost_inlines();
508
509 HtmlDocument {
510 text: text.to_string(),
511 base: HtmlBase::new(
512 content,
513 size,
514 PathBuf::default(),
515 PathBuf::from(VIEWER_STYLESHEET),
516 PathBuf::from(USER_STYLESHEET),
517 ),
518 }
519 }
520
521 #[cfg_attr(feature = "tracing", tracing::instrument(skip(self, text), fields(len = text.len())))]
524 pub fn update(&mut self, text: &str) {
525 self.base.size = text.len();
526 self.base.content = XmlParser::new(text).parse();
527 self.base.content.wrap_lost_inlines();
528 self.text = text.to_string();
529 self.base.pages.clear();
530 }
531
532 pub fn set_margin(&mut self, margin: &Edge) {
534 self.base.engine.set_margin(margin);
535 self.base.pages.clear();
536 }
537
538 pub fn set_font_size(&mut self, font_size: f32) {
540 self.base.engine.set_font_size(font_size);
541 self.base.pages.clear();
542 }
543
544 pub fn set_viewer_stylesheet<P: AsRef<Path>>(&mut self, path: P) {
546 self.base.viewer_stylesheet = path.as_ref().to_path_buf();
547 self.base.pages.clear();
548 }
549
550 pub fn set_user_stylesheet<P: AsRef<Path>>(&mut self, path: P) {
552 self.base.user_stylesheet = path.as_ref().to_path_buf();
553 self.base.pages.clear();
554 }
555
556 pub fn categories(&self) -> Option<String> {
558 None
559 }
560
561 pub fn description(&self) -> Option<String> {
563 self.base.metadata("description")
564 }
565
566 pub fn language(&self) -> Option<String> {
569 self.base
570 .content
571 .root()
572 .find("html")
573 .and_then(|html| html.attribute("xml:lang"))
574 .map(String::from)
575 }
576
577 pub fn year(&self) -> Option<String> {
580 self.base
581 .metadata("date")
582 .map(|s| s.chars().take(4).collect())
583 }
584}
585
586impl Document for HtmlDocument {
587 #[inline]
588 fn dims(&self, _index: usize) -> Option<(f32, f32)> {
589 Some((
590 self.base.engine.dims.0 as f32,
591 self.base.engine.dims.1 as f32,
592 ))
593 }
594
595 fn pages_count(&self) -> usize {
596 self.base.size
597 }
598
599 fn toc(&mut self) -> Option<Vec<TocEntry>> {
600 None
601 }
602
603 fn chapter<'a>(&mut self, _offset: usize, _toc: &'a [TocEntry]) -> Option<(&'a TocEntry, f32)> {
604 None
605 }
606
607 fn chapter_relative<'a>(
608 &mut self,
609 _offset: usize,
610 _dir: CycleDir,
611 _toc: &'a [TocEntry],
612 ) -> Option<&'a TocEntry> {
613 None
614 }
615
616 fn resolve_location(&mut self, loc: Location) -> Option<usize> {
617 self.base.resolve_location(loc)
618 }
619
620 fn words(&mut self, loc: Location) -> Option<(Vec<BoundedText>, usize)> {
621 self.base.words(loc)
622 }
623
624 fn lines(&mut self, _loc: Location) -> Option<(Vec<BoundedText>, usize)> {
625 None
626 }
627
628 fn images(&mut self, loc: Location) -> Option<(Vec<Boundary>, usize)> {
629 self.base.images(loc)
630 }
631
632 fn links(&mut self, loc: Location) -> Option<(Vec<BoundedText>, usize)> {
633 self.base.links(loc)
634 }
635
636 fn pixmap(&mut self, loc: Location, scale: f32, samples: usize) -> Option<(Pixmap, usize)> {
637 self.base.pixmap(loc, scale, samples)
638 }
639
640 fn layout(&mut self, width: u32, height: u32, font_size: f32, dpi: u16) {
641 self.base.engine.layout(width, height, font_size, dpi);
642 self.base.pages.clear();
643 }
644
645 fn set_text_align(&mut self, text_align: TextAlign) {
646 self.base.engine.set_text_align(text_align);
647 self.base.pages.clear();
648 }
649
650 fn set_font_family(&mut self, family_name: &str, search_path: &str) {
651 self.base.engine.set_font_family(family_name, search_path);
652 self.base.pages.clear();
653 }
654
655 fn set_margin_width(&mut self, width: i32) {
656 self.base.engine.set_margin_width(width);
657 self.base.pages.clear();
658 }
659
660 fn set_line_height(&mut self, line_height: f32) {
661 self.base.engine.set_line_height(line_height);
662 self.base.pages.clear();
663 }
664
665 fn set_hyphen_penalty(&mut self, hyphen_penalty: i32) {
666 self.base.engine.set_hyphen_penalty(hyphen_penalty);
667 self.base.pages.clear();
668 }
669
670 fn set_stretch_tolerance(&mut self, stretch_tolerance: f32) {
671 self.base.engine.set_stretch_tolerance(stretch_tolerance);
672 self.base.pages.clear();
673 }
674
675 fn set_ignore_document_css(&mut self, ignore: bool) {
676 self.base.ignore_document_css = ignore;
677 self.base.pages.clear();
678 }
679
680 fn title(&self) -> Option<String> {
681 self.base
682 .content
683 .root()
684 .find("head")
685 .and_then(|head| {
686 head.children()
687 .find(|child| child.tag_name() == Some("title"))
688 })
689 .map(|child| decode_entities(&child.text()).into_owned())
690 }
691
692 fn author(&self) -> Option<String> {
693 self.base.metadata("author")
694 }
695
696 fn metadata(&self, key: &str) -> Option<String> {
697 self.base.metadata(key)
698 }
699
700 fn save(&self, path: &str) -> Result<(), Error> {
701 let mut file = File::create(path)?;
702 file.write_all(self.text.as_bytes()).map_err(Into::into)
703 }
704
705 fn is_reflowable(&self) -> bool {
706 true
707 }
708
709 fn has_synthetic_page_numbers(&self) -> bool {
710 true
711 }
712}
713
#[cfg(test)]
mod tests {
    use super::*;
    use crate::document::html::layout::DrawCommand;
    use std::path::PathBuf;

    /// Builds an in-memory document laid out at 600x800 with fonts loaded
    /// from the directory named by the `TEST_ROOT_DIR` environment variable.
    fn setup_doc(html: &str) -> HtmlDocument {
        let fonts_root = std::env::var("TEST_ROOT_DIR")
            .map(PathBuf::from)
            .expect("TEST_ROOT_DIR must be set for html tests");
        let mut doc = HtmlDocument::new_from_memory(html);
        let engine = &mut doc.base.engine;
        engine.layout(600, 800, 12.0, 265);
        engine.set_margin_width(3);
        engine.load_fonts_from(fonts_root);
        doc
    }

    /// A list nested inside another list must place its text further to the
    /// right than the outer list's text.
    #[test]
    fn nested_list_items_are_indented_further_than_outer_items() {
        let html = r#"<ol><li>Outer item</li><ol style="list-style-type:lower-alpha"><li>Inner item</li></ol></ol>"#;
        let mut doc = setup_doc(html);

        let pages = doc.base.build_pages();

        // Horizontal position of every text command across all pages.
        let text_x_positions: Vec<i32> = pages
            .iter()
            .flatten()
            .filter_map(|cmd| match cmd {
                DrawCommand::Text(tc) => Some(tc.position.x),
                DrawCommand::ExtraText(tc) => Some(tc.position.x),
                _ => None,
            })
            .collect();

        assert!(
            text_x_positions.len() >= 2,
            "expected at least two text items, got {}",
            text_x_positions.len()
        );

        let min_x = *text_x_positions.iter().min().unwrap();
        let max_x = *text_x_positions.iter().max().unwrap();

        assert!(
            max_x > min_x,
            "inner list item (x={}) should be indented further than outer item (x={})",
            max_x,
            min_x
        );
    }
}