// cadmus_core/dictionary/dictreader.rs

use std::ffi::OsStr;
use std::fs::File;
use std::io;
use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
use std::path::Path;

use byteorder::*;

use super::errors::DictError;
/// Maximum number of bytes a single `fetch_definition` call may allocate;
/// larger requests are rejected with `DictError::MemoryError`.
pub const MAX_BYTES_FOR_BUFFER: u64 = 1_048_576;

/// gzip FLG bit: an FEXTRA field follows the header (required for dictzip).
pub const GZ_FEXTRA: u8 = 0b0000_0100;
/// gzip FLG bit: a NUL-terminated original file name follows the header.
pub const GZ_FNAME: u8 = 0b0000_1000;
/// gzip FLG bit: a NUL-terminated file comment follows the header.
pub const GZ_COMMENT: u8 = 0b0001_0000;
/// gzip FLG bit: a CRC16 over the gzip header follows it.
pub const GZ_FHCRC: u8 = 0b0000_0010;
/// A source of dictionary definitions addressable by byte offset.
///
/// Implementors resolve a `(start_offset, length)` pair (e.g. offsets taken
/// from a dictionary index) to the UTF-8 text stored at that position in the
/// uncompressed dictionary data.
pub trait DictReader {
    /// Read `length` bytes of uncompressed dictionary data starting at
    /// `start_offset` and return them as a `String`.
    fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError>;
}
44
/// A [`DictReader`] over plain, uncompressed dictionary data.
pub struct DictReaderRaw<B: Read + Seek> {
    // Underlying seekable byte source.
    dict_data: B,
    // Total length of `dict_data` in bytes, measured once at construction.
    total_length: u64,
}
52
53impl<B: Read + Seek> DictReaderRaw<B> {
54 pub fn new(mut dict_data: B) -> Result<DictReaderRaw<B>, DictError> {
56 let end = dict_data.seek(SeekFrom::End(0))?;
57 Ok(DictReaderRaw {
58 dict_data,
59 total_length: end,
60 })
61 }
62}
63
64impl<B: Read + Seek> DictReader for DictReaderRaw<B> {
65 fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError> {
67 if length > MAX_BYTES_FOR_BUFFER {
68 return Err(DictError::MemoryError);
69 }
70
71 if (start_offset + length) > self.total_length {
72 return Err(DictError::IoError(io::Error::new(
73 io::ErrorKind::UnexpectedEof,
74 "a \
75 seek beyond the end of uncompressed data was requested",
76 )));
77 }
78
79 self.dict_data.seek(SeekFrom::Start(start_offset))?;
80 let mut read_data = vec![0; length as usize];
81 let bytes_read = self.dict_data.read(read_data.as_mut_slice())? as u64;
82 if bytes_read != length {
83 return Err(DictError::IoError(io::Error::new(
85 io::ErrorKind::UnexpectedEof,
86 "seek beyond end of file",
87 )));
88 }
89 Ok(String::from_utf8(read_data)?)
90 }
91}
92
93pub fn load_dict<P: AsRef<Path>>(path: P) -> Result<Box<dyn DictReader>, DictError> {
104 if path.as_ref().extension() == Some(OsStr::new("dz")) {
105 let reader = File::open(path)?;
106 Ok(Box::new(DictReaderDz::new(reader)?))
107 } else {
108 let reader = BufReader::new(File::open(path)?);
109 Ok(Box::new(DictReaderRaw::new(reader)?))
110 }
111}
112
/// A [`DictReader`] for dictzip (`.dz`) files: gzip archives whose FEXTRA
/// header records per-chunk compressed sizes, allowing random access by
/// decompressing only the chunks covering a requested range.
pub struct DictReaderDz<B: Read + Seek> {
    // Compressed dictzip data source.
    dzdict: B,
    // Size in bytes of each chunk when uncompressed (the file's last chunk
    // may decode to fewer bytes).
    uchunk_length: usize,
    // Offset one past the last byte of compressed chunk data.
    end_compressed_data: usize,
    // Absolute file offset of each compressed chunk, in order.
    chunk_offsets: Vec<usize>,
    // Total uncompressed length, taken from the 32-bit value read right
    // after the compressed data — NOTE(review): RFC 1952 places CRC32 before
    // ISIZE in the gzip trailer; confirm this really is the length field.
    ufile_length: u64,
}
130
/// Location of one compressed chunk within the `.dz` file.
#[derive(Debug)]
struct Chunk {
    // Absolute byte offset of the chunk's compressed data.
    offset: usize,
    // Length of the compressed data in bytes.
    length: usize,
}
137
138impl<B: Read + Seek> DictReaderDz<B> {
139 pub fn new(dzdict: B) -> Result<DictReaderDz<B>, DictError> {
141 let mut buffered_dzdict = BufReader::new(dzdict);
142 let mut header = vec![0u8; 12];
143 buffered_dzdict.read_exact(&mut header)?;
144 if header[0..2] != [0x1F, 0x8B] {
145 return Err(DictError::InvalidFileFormat(
146 "Not in gzip format".into(),
147 None,
148 ));
149 }
150
151 let flags = &header[3]; if (flags & GZ_FEXTRA) == 0 {
153 return Err(DictError::InvalidFileFormat(
155 "Extra flag (FLG.FEXTRA) \
156 not set, not in gzip + dzip format"
157 .into(),
158 None,
159 ));
160 }
161
162 let xlen = LittleEndian::read_u16(&header[10..12]);
164
165 let mut fextra = vec![0u8; xlen as usize];
167 buffered_dzdict.read_exact(&mut fextra)?;
168
169 if fextra[0..2] != [b'R', b'A'] {
170 return Err(DictError::InvalidFileFormat(
171 "No dictzip info found in FEXTRA \
172 header (behind XLEN, in SI1SI2 fields)"
173 .into(),
174 None,
175 ));
176 }
177
178 let length_subfield = LittleEndian::read_u16(&fextra[2..4]);
179 assert_eq!(
180 length_subfield,
181 xlen - 4,
182 "the length of the subfield \
183 should be the same as the fextra field, ignoring the \
184 additional length information and the file format identification"
185 );
186 let subf_version = LittleEndian::read_u16(&fextra[4..6]);
187 if subf_version != 1 {
188 return Err(DictError::InvalidFileFormat(
189 "Unimplemented dictzip \
190 version, only ver 1 supported"
191 .into(),
192 None,
193 ));
194 }
195
196 let uchunk_length = LittleEndian::read_u16(&fextra[6..8]);
199 let chunk_count = LittleEndian::read_u16(&fextra[8..10]);
201 if chunk_count == 0 {
202 return Err(DictError::InvalidFileFormat(
203 "No compressed chunks in \
204 file or broken header information"
205 .into(),
206 None,
207 ));
208 }
209
210 let numbers_chunks_which_would_fit = ((fextra.len() - 10) / 2) as u16; if numbers_chunks_which_would_fit != chunk_count {
216 return Err(DictError::InvalidFileFormat(
217 format!(
218 "Expected {} chunks \
219 according to dictzip header, but the FEXTRA field can \
220 accomodate {}; possibly broken file",
221 chunk_count, numbers_chunks_which_would_fit
222 ),
223 None,
224 ));
225 }
226
227 if (flags & GZ_FNAME) != 0 {
229 let mut tmp = Vec::new();
230 buffered_dzdict.read_until(b'\0', &mut tmp)?;
231 }
232
233 if (flags & GZ_COMMENT) != 0 {
235 let mut tmp = Vec::new();
236 buffered_dzdict.read_until(b'\0', &mut tmp)?;
237 }
238
239 if (flags & GZ_FHCRC) != 0 {
241 buffered_dzdict.seek(SeekFrom::Current(2))?;
242 }
243
244 let mut chunk_offsets = Vec::with_capacity(chunk_count as usize);
246 let mut end_compressed_data = buffered_dzdict.seek(SeekFrom::Current(0))? as usize;
248 let chunks_from_header = &fextra[10usize..(10 + chunk_count * 2) as usize];
250
251 for index in (0..chunks_from_header.len()).filter(|i| (i % 2) == 0) {
253 let index = index as usize;
254 let compressed_len =
255 LittleEndian::read_u16(&chunks_from_header[index..(index + 2)]) as usize;
256 chunk_offsets.push(end_compressed_data);
257 end_compressed_data += compressed_len;
258 }
259 assert_eq!(chunk_offsets.len() as u16, chunk_count, "The read number of compressed chunks in \
260 the .dz file must be equivalent to the number of chunks actually found in the file.\n");
261
262 buffered_dzdict.seek(SeekFrom::Start(end_compressed_data as u64))?;
264 let uncompressed = buffered_dzdict.read_i32::<LittleEndian>()?;
265
266 Ok(DictReaderDz {
267 dzdict: buffered_dzdict.into_inner(),
268 chunk_offsets,
269 end_compressed_data,
270 uchunk_length: uchunk_length as usize,
271 ufile_length: uncompressed as u64,
272 })
273 }
274
275 fn get_chunks_for(&self, start_offset: u64, length: u64) -> Vec<Chunk> {
276 let mut chunks = Vec::new();
277 let start_chunk = start_offset as usize / self.uchunk_length;
278 let end_chunk = (start_offset + length) as usize / self.uchunk_length;
279 for id in start_chunk..=end_chunk {
280 let chunk_length = match self.chunk_offsets.get(id + 1) {
281 Some(next) => next - self.chunk_offsets[id],
282 None => self.end_compressed_data - self.chunk_offsets[id],
283 };
284 chunks.push(Chunk {
285 offset: self.chunk_offsets[id],
286 length: chunk_length,
287 });
288 }
289
290 chunks
291 }
292
293 fn inflate(&self, data: Vec<u8>) -> Result<Vec<u8>, DictError> {
295 let mut decoder = flate2::Decompress::new(false);
296 let mut decoded = vec![0u8; self.uchunk_length];
297 decoder.decompress(
298 data.as_slice(),
299 decoded.as_mut_slice(),
300 flate2::FlushDecompress::None,
301 )?;
302 Ok(decoded)
303 }
304}
305
306impl<B: Read + Seek> DictReader for DictReaderDz<B> {
307 fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError> {
309 if length > MAX_BYTES_FOR_BUFFER {
310 return Err(DictError::MemoryError);
311 }
312 if (start_offset + length) > self.ufile_length {
313 return Err(DictError::IoError(io::Error::new(
314 io::ErrorKind::UnexpectedEof,
315 "a \
316 seek beyond the end of uncompressed data was requested",
317 )));
318 }
319 let mut data = Vec::new();
320 for chunk in self.get_chunks_for(start_offset, length) {
321 let pos = self.dzdict.seek(SeekFrom::Start(chunk.offset as u64))?;
322 if pos != (chunk.offset as u64) {
323 return Err(DictError::IoError(io::Error::new(
324 io::ErrorKind::Other,
325 format!(
326 "attempted to seek to {} but new position is {}",
327 chunk.offset, pos
328 ),
329 )));
330 }
331 let mut definition = vec![0u8; chunk.length];
332 self.dzdict.read_exact(&mut definition)?;
333 data.push(self.inflate(definition)?);
334 }
335
336 let cut_front = start_offset as usize % self.uchunk_length;
338 let data = match data.len() {
340 0 => panic!(),
341 1 => data[0][cut_front..cut_front + length as usize].to_vec(),
342 n => {
343 let mut tmp = data[0][cut_front..].to_vec();
344 for text in data.iter().skip(1).take(n - 2) {
346 tmp.extend_from_slice(text);
347 }
348 let remaining_bytes = (length as usize + cut_front) % self.uchunk_length;
350 tmp.extend_from_slice(&data[n - 1][..remaining_bytes]);
351 tmp
352 }
353 };
354 Ok(String::from_utf8(data)?)
355 }
356}