Skip to main content

cadmus_core/
helpers.rs

1use anyhow::{Context, Error};
2use entities::ENTITIES;
3use fxhash::FxHashMap;
4use lazy_static::lazy_static;
5use serde::de::{self, Visitor};
6use serde::{Deserialize, Deserializer, Serialize, Serializer};
7use std::borrow::Cow;
8use std::char;
9use std::fmt;
10use std::fs::{self, File};
11use std::io::{self, BufReader, BufWriter};
12use std::ops::Deref;
13use std::path::{Component, Path, PathBuf};
14use std::str::FromStr;
15use walkdir::DirEntry;
16
17lazy_static! {
18    pub static ref CHARACTER_ENTITIES: FxHashMap<&'static str, &'static str> = {
19        let mut m = FxHashMap::default();
20        for e in ENTITIES.iter() {
21            m.insert(e.entity, e.characters);
22        }
23        m
24    };
25}
26
27pub fn decode_entities(text: &str) -> Cow<'_, str> {
28    if text.find('&').is_none() {
29        return Cow::Borrowed(text);
30    }
31
32    let mut cursor = text;
33    let mut buf = String::with_capacity(text.len());
34
35    while let Some(start_index) = cursor.find('&') {
36        buf.push_str(&cursor[..start_index]);
37        cursor = &cursor[start_index..];
38        if let Some(end_index) = cursor.find(';') {
39            if let Some(repl) = CHARACTER_ENTITIES.get(&cursor[..=end_index]) {
40                buf.push_str(repl);
41            } else if cursor[1..].starts_with('#') {
42                let radix = if cursor[2..].starts_with('x') { 16 } else { 10 };
43                let drift_index = 2 + radix as usize / 16;
44                if let Some(ch) = u32::from_str_radix(&cursor[drift_index..end_index], radix)
45                    .ok()
46                    .and_then(char::from_u32)
47                {
48                    buf.push(ch);
49                } else {
50                    buf.push_str(&cursor[..=end_index]);
51                }
52            } else {
53                buf.push_str(&cursor[..=end_index]);
54            }
55            cursor = &cursor[end_index + 1..];
56        } else {
57            break;
58        }
59    }
60
61    buf.push_str(cursor);
62    Cow::Owned(buf)
63}
64
65pub fn load_json<T, P: AsRef<Path>>(path: P) -> Result<T, Error>
66where
67    for<'a> T: Deserialize<'a>,
68{
69    let file = File::open(path.as_ref())
70        .with_context(|| format!("can't open file {}", path.as_ref().display()))?;
71    let reader = BufReader::new(file);
72    serde_json::from_reader(reader)
73        .with_context(|| format!("can't parse JSON from {}", path.as_ref().display()))
74        .map_err(Into::into)
75}
76
77pub fn save_json<T, P: AsRef<Path>>(data: &T, path: P) -> Result<(), Error>
78where
79    T: Serialize,
80{
81    let file = File::create(path.as_ref())
82        .with_context(|| format!("can't create file {}", path.as_ref().display()))?;
83    let writer = BufWriter::new(file);
84    serde_json::to_writer_pretty(writer, data)
85        .with_context(|| format!("can't serialize to JSON file {}", path.as_ref().display()))
86        .map_err(Into::into)
87}
88
89pub fn load_toml<T, P: AsRef<Path>>(path: P) -> Result<T, Error>
90where
91    for<'a> T: Deserialize<'a>,
92{
93    let s = fs::read_to_string(path.as_ref())
94        .with_context(|| format!("can't read file {}", path.as_ref().display()))?;
95    toml::from_str(&s)
96        .with_context(|| format!("can't parse TOML content from {}", path.as_ref().display()))
97        .map_err(Into::into)
98}
99
100#[cfg_attr(feature = "tracing", tracing::instrument(skip(data, path), fields(file_path = %path.as_ref().display())))]
101pub fn save_toml<T, P: AsRef<Path>>(data: &T, path: P) -> Result<(), Error>
102where
103    T: Serialize,
104{
105    let path_ref = path.as_ref();
106    tracing::debug!(file_path = %path_ref.display(), "serializing data to TOML");
107    let s = toml::to_string(data).context("can't convert to TOML format")?;
108
109    tracing::debug!(
110        file_path = %path_ref.display(),
111        toml_size = s.len(),
112        "writing TOML to file"
113    );
114
115    match fs::write(path_ref, &s) {
116        Ok(()) => {
117            let file_size = path_ref.metadata().ok().map(|m| m.len());
118
119            tracing::debug!(
120                file_path = %path_ref.display(),
121                file_size = ?file_size,
122                "successfully wrote TOML file"
123            );
124
125            Ok(())
126        }
127        Err(e) => {
128            tracing::error!(
129                file_path = %path_ref.display(),
130                error = %e,
131                "failed to write TOML file"
132            );
133            Err(anyhow::Error::new(e))
134                .context(format!("can't write to file {}", path_ref.display()))
135        }
136    }
137}
138
139/// Computes a content-based fingerprint for a file.
140///
141/// Implemented on [`Path`] to hash the full file contents using BLAKE3,
142/// producing a stable 32-byte digest that is independent of filesystem
143/// metadata such as modification time or file size.
144///
145/// # Hashing strategy
146///
147/// The implementation selects between two BLAKE3 strategies based on file size:
148///
149/// - **< 10 MiB** — [`update_reader`](blake3::Hasher::update_reader): plain
150///   buffered sequential read. Avoids mmap syscall overhead for small files.
151///   On slow storage, a single sequential `read()` into a buffer is faster
152///   than taking page faults through a memory mapping. The typical e-book
153///   (100 KiB–500 KiB) falls into this range.
154///
155/// - **≥ 10 MiB** — [`update_mmap`](blake3::Hasher::update_mmap):
156///   single-threaded memory-mapped hashing. Avoids buffered-read overhead for
157///   large files while keeping CPU usage on a single core.
158pub trait Fingerprint {
159    fn fingerprint(&self) -> io::Result<Fp>;
160}
161
162const MMAP_THRESHOLD: u64 = 10 * 1024 * 1024;
163
164impl Fingerprint for Path {
165    #[cfg_attr(feature = "tracing", tracing::instrument(ret(level=tracing::Level::TRACE)))]
166    fn fingerprint(&self) -> io::Result<Fp> {
167        let mut hasher = blake3::Hasher::new();
168        if std::fs::metadata(self)?.len() >= MMAP_THRESHOLD {
169            hasher.update_mmap(self)?;
170        } else {
171            let file = std::fs::File::open(self)?;
172            hasher.update_reader(file)?;
173        }
174        Ok(Fp(*hasher.finalize().as_bytes()))
175    }
176}
177
/// A 32-byte BLAKE3 content fingerprint used as the primary key for books.
///
/// Serialized as a 64-character lowercase hex string (e.g.
/// `"af1349b9f5f9a1a6a0404dea36dcc949..."`).
#[derive(Debug, Copy, Clone, Hash, Eq, PartialEq)]
pub struct Fp([u8; 32]);

impl Fp {
    /// Constructs an `Fp` from a `u64` seed for use in tests.
    ///
    /// The seed is written into the last 8 bytes (big-endian); the remaining
    /// 24 bytes are zero. This guarantees uniqueness for distinct seeds while
    /// producing a valid 32-byte fingerprint.
    #[cfg(test)]
    pub fn from_u64(seed: u64) -> Self {
        Self::from_seed(seed)
    }

    /// Parses a legacy 16-character hex fingerprint (mtime + size metadata
    /// format) into an `Fp`. Upper- and lowercase digits are both accepted.
    ///
    /// The `u64` value is stored in the last 8 bytes (big-endian); the
    /// remaining 24 bytes are zero. This matches the `from_u64` layout so
    /// that legacy entries round-trip consistently. V2 migration will
    /// re-key these to real BLAKE3 hashes once the files are found on disk.
    ///
    /// # Errors
    ///
    /// Returns [`FpParseError`] unless `s` consists of exactly 16 ASCII hex
    /// digits. Sign characters are rejected.
    pub(crate) fn from_legacy_str(s: &str) -> Result<Self, FpParseError> {
        if s.len() != 16 {
            return Err(FpParseError);
        }
        // Decode digit by digit instead of calling `u64::from_str_radix`,
        // which would also accept a leading '+'. Sixteen nibbles fill the
        // `u64` exactly, so the shifts never discard set bits.
        let mut seed = 0u64;
        for &digit in s.as_bytes() {
            let nibble = Self::hex_nibble(digit).ok_or(FpParseError)?;
            seed = (seed << 4) | u64::from(nibble);
        }
        Ok(Self::from_seed(seed))
    }

    /// Places `seed` (big-endian) in the last 8 bytes of an otherwise zeroed
    /// fingerprint.
    fn from_seed(seed: u64) -> Self {
        let mut bytes = [0u8; 32];
        bytes[24..].copy_from_slice(&seed.to_be_bytes());
        Fp(bytes)
    }

    /// Returns the value of a single ASCII hex digit, or `None` for any other
    /// byte (including sign characters and non-ASCII bytes).
    fn hex_nibble(digit: u8) -> Option<u8> {
        match digit {
            b'0'..=b'9' => Some(digit - b'0'),
            b'a'..=b'f' => Some(digit - b'a' + 10),
            b'A'..=b'F' => Some(digit - b'A' + 10),
            _ => None,
        }
    }
}

impl Deref for Fp {
    type Target = [u8; 32];

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

/// Error returned when a hex string cannot be decoded into an [`Fp`].
#[derive(Debug)]
pub struct FpParseError;

impl fmt::Display for FpParseError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(
            "invalid fingerprint: expected 64 hex characters or 16 hex characters (legacy format)",
        )
    }
}

impl std::error::Error for FpParseError {}

impl FromStr for Fp {
    type Err = FpParseError;

    /// Parses either a 64-character hex fingerprint or a 16-character legacy
    /// one (see [`Fp::from_legacy_str`]). Digits may be upper- or lowercase.
    ///
    /// Every byte must be an ASCII hex digit, so non-ASCII input and sign
    /// characters are rejected.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let raw = s.as_bytes();
        match raw.len() {
            16 => Self::from_legacy_str(s),
            64 => {
                let mut bytes = [0u8; 32];
                // Working on bytes keeps multi-byte characters from ever
                // being sliced; they simply fail the hex-digit check.
                for (byte, pair) in bytes.iter_mut().zip(raw.chunks_exact(2)) {
                    let high = Self::hex_nibble(pair[0]).ok_or(FpParseError)?;
                    let low = Self::hex_nibble(pair[1]).ok_or(FpParseError)?;
                    *byte = (high << 4) | low;
                }
                Ok(Fp(bytes))
            }
            _ => Err(FpParseError),
        }
    }
}

impl fmt::Display for Fp {
    /// Writes the fingerprint as 64 lowercase hex characters.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.0.iter().try_for_each(|byte| write!(f, "{:02x}", byte))
    }
}
271
272impl sqlx::Type<sqlx::Sqlite> for Fp {
273    fn type_info() -> sqlx::sqlite::SqliteTypeInfo {
274        <String as sqlx::Type<sqlx::Sqlite>>::type_info()
275    }
276
277    fn compatible(ty: &sqlx::sqlite::SqliteTypeInfo) -> bool {
278        <String as sqlx::Type<sqlx::Sqlite>>::compatible(ty)
279    }
280}
281
282impl<'q> sqlx::Encode<'q, sqlx::Sqlite> for Fp {
283    fn encode_by_ref(
284        &self,
285        buf: &mut Vec<sqlx::sqlite::SqliteArgumentValue<'q>>,
286    ) -> Result<sqlx::encode::IsNull, sqlx::error::BoxDynError> {
287        self.to_string().encode_by_ref(buf)
288    }
289}
290
291impl<'r> sqlx::Decode<'r, sqlx::Sqlite> for Fp {
292    fn decode(value: sqlx::sqlite::SqliteValueRef<'r>) -> Result<Self, sqlx::error::BoxDynError> {
293        let s = <String as sqlx::Decode<'r, sqlx::Sqlite>>::decode(value)?;
294        s.parse().map_err(|e: FpParseError| e.to_string().into())
295    }
296}
297
298impl Serialize for Fp {
299    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
300    where
301        S: Serializer,
302    {
303        serializer.serialize_str(&self.to_string())
304    }
305}
306
307struct FpVisitor;
308
309impl<'de> Visitor<'de> for FpVisitor {
310    type Value = Fp;
311
312    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
313        formatter.write_str("a 64-character hex string or a 16-character legacy hex string")
314    }
315
316    fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
317    where
318        E: de::Error,
319    {
320        Fp::from_str(value)
321            .or_else(|_| Fp::from_legacy_str(value))
322            .map_err(|e| E::custom(format!("can't parse fingerprint: {}", e)))
323    }
324}
325
326impl<'de> Deserialize<'de> for Fp {
327    fn deserialize<D>(deserializer: D) -> Result<Fp, D::Error>
328    where
329        D: Deserializer<'de>,
330    {
331        deserializer.deserialize_str(FpVisitor)
332    }
333}
334
/// Produces a normalized, owned copy of a borrowed value.
pub trait Normalize: ToOwned {
    fn normalize(&self) -> Self::Owned;
}

impl Normalize for Path {
    /// Resolves `.` and `..` components lexically, without touching the
    /// filesystem.
    ///
    /// `.` components are dropped and each `..` removes the component
    /// collected before it. A `..` with nothing left to remove (at the start
    /// of a relative path, or directly under the root) is discarded.
    fn normalize(&self) -> PathBuf {
        self.components().fold(PathBuf::new(), |mut cleaned, part| {
            match part {
                Component::CurDir => {}
                Component::ParentDir => {
                    cleaned.pop();
                }
                other => cleaned.push(other),
            }
            cleaned
        })
    }
}
356
/// Extra ASCII-related conversions.
pub trait AsciiExtension {
    fn to_alphabetic_digit(self) -> Option<u32>;
}

impl AsciiExtension for char {
    /// Maps an ASCII uppercase letter to its zero-based position in the
    /// alphabet (`'A'` → 0, `'Z'` → 25); any other character yields `None`.
    fn to_alphabetic_digit(self) -> Option<u32> {
        match self {
            'A'..='Z' => Some(self as u32 - 'A' as u32),
            _ => None,
        }
    }
}
370
371pub mod datetime_format {
372    use chrono::NaiveDateTime;
373    use serde::{self, Deserialize, Deserializer, Serializer};
374
375    pub const FORMAT: &str = "%Y-%m-%d %H:%M:%S";
376
377    pub fn serialize<S>(date: &NaiveDateTime, serializer: S) -> Result<S::Ok, S::Error>
378    where
379        S: Serializer,
380    {
381        let s = format!("{}", date.format(FORMAT));
382        serializer.serialize_str(&s)
383    }
384
385    pub fn deserialize<'de, D>(deserializer: D) -> Result<NaiveDateTime, D::Error>
386    where
387        D: Deserializer<'de>,
388    {
389        let s = String::deserialize(deserializer)?;
390        NaiveDateTime::parse_from_str(&s, FORMAT).map_err(serde::de::Error::custom)
391    }
392}
393
394pub trait IsHidden {
395    fn is_hidden(&self) -> bool;
396}
397
398impl IsHidden for DirEntry {
399    fn is_hidden(&self) -> bool {
400        self.file_name()
401            .to_str()
402            .map_or(false, |s| s.starts_with('.'))
403    }
404}
405
#[cfg(test)]
mod tests {
    use super::*;
    use std::str::FromStr;

    /// Lowercase form of the 64-character fingerprint used by several tests.
    const FULL_HEX_LOWER: &str =
        "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";

    #[test]
    fn test_entities() {
        // (input, expected output)
        let cases = [
            ("a &amp b", "a &amp b"),
            ("a &zZz; b", "a &zZz; b"),
            ("a &amp; b", "a & b"),
            ("a &#x003E; b", "a > b"),
            ("a &#38; b", "a & b"),
            ("a &lt; b &gt; c", "a < b > c"),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(decode_entities(input), expected);
        }
    }

    #[test]
    fn fp_from_str_rejects_non_ascii_input() {
        // '€' occupies three bytes, so the byte length is still 64.
        let invalid = format!("a€{}", "0".repeat(60));

        assert_eq!(invalid.len(), 64);
        assert!(Fp::from_str(&invalid).is_err());
    }

    #[test]
    fn fp_from_str_parses_valid_legacy_hex() {
        let parsed = Fp::from_str("0123456789ABCDEF").expect("legacy fingerprint should parse");

        assert_eq!(
            parsed.to_string(),
            "0000000000000000000000000000000000000000000000000123456789abcdef"
        );
    }

    #[test]
    fn fp_from_str_rejects_invalid_legacy_hex() {
        let outcome = Fp::from_str("0123456789ABCDEG");

        assert!(outcome.is_err());
    }

    #[test]
    fn fp_from_str_parses_valid_hex() {
        let parsed = Fp::from_str(FULL_HEX_LOWER).expect("valid fingerprint should parse");

        assert_eq!(parsed.to_string(), FULL_HEX_LOWER);
    }

    #[test]
    fn fp_from_str_accepts_uppercase_hex() {
        let upper = "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF";
        let parsed = Fp::from_str(upper).expect("uppercase fingerprint should parse");

        assert_eq!(parsed.to_string(), FULL_HEX_LOWER);
    }
}