1use std::ffi::OsStr;
13use std::fs::File;
14use std::io;
15use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
16use std::path::Path;
17
18use super::errors::DictError;
19use byteorder::*;
20
/// Largest `length` (in bytes) a single `fetch_definition` call accepts; larger requests are
/// rejected with `DictError::MemoryError` (1 MiB).
pub static MAX_BYTES_FOR_BUFFER: u64 = 1 << 20;

/// Bit of the gzip flag byte (`header[3]`) that must be set for a file to be accepted as dictzip
/// (FLG.FEXTRA).
pub static GZ_FEXTRA: u8 = 1 << 2;

/// Bit of the gzip flag byte announcing a NUL-terminated field that is skipped while parsing
/// (tested first of the two such fields).
pub static GZ_FNAME: u8 = 1 << 3;

/// Bit of the gzip flag byte announcing a second NUL-terminated field that is skipped while
/// parsing.
pub static GZ_COMMENT: u8 = 1 << 4;

/// Bit of the gzip flag byte announcing two additional header bytes that are skipped while
/// parsing.
pub static GZ_FHCRC: u8 = 1 << 1;
33
/// Common interface of the dictionary readers in this file.
///
/// An implementor gives access to a piece of the dictionary text identified by a byte offset
/// and a byte length.
pub trait DictReader {
    /// Return `length` bytes of dictionary text starting at byte `start_offset`, decoded as
    /// UTF-8.
    ///
    /// Both offsets are expressed in terms of the *uncompressed* dictionary data. The
    /// implementations in this file reject requests larger than `MAX_BYTES_FOR_BUFFER` and
    /// requests that reach beyond the end of the data with an error.
    fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError>;
}
44
/// Reader for dictionary data that is stored uncompressed in a seekable source.
pub struct DictReaderRaw<B: Read + Seek> {
    /// The underlying seekable data source definitions are read from.
    dict_data: B,
    /// Position of the end of `dict_data`, determined once in `new`; used to reject
    /// out-of-range requests.
    total_length: u64,
}
52
53impl<B: Read + Seek> DictReaderRaw<B> {
54 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
56 pub fn new(mut dict_data: B) -> Result<DictReaderRaw<B>, DictError> {
57 let end = dict_data.seek(SeekFrom::End(0))?;
58 Ok(DictReaderRaw {
59 dict_data,
60 total_length: end,
61 })
62 }
63}
64
65impl<B: Read + Seek> DictReader for DictReaderRaw<B> {
66 #[cfg_attr(
68 feature = "tracing",
69 tracing::instrument(skip(self), fields(start_offset, length))
70 )]
71 fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError> {
72 if length > MAX_BYTES_FOR_BUFFER {
73 return Err(DictError::MemoryError);
74 }
75
76 if (start_offset + length) > self.total_length {
77 return Err(DictError::IoError(io::Error::new(
78 io::ErrorKind::UnexpectedEof,
79 "a \
80 seek beyond the end of uncompressed data was requested",
81 )));
82 }
83
84 self.dict_data.seek(SeekFrom::Start(start_offset))?;
85 let mut read_data = vec![0; length as usize];
86 let bytes_read = self.dict_data.read(read_data.as_mut_slice())? as u64;
87 if bytes_read != length {
88 return Err(DictError::IoError(io::Error::new(
90 io::ErrorKind::UnexpectedEof,
91 "seek beyond end of file",
92 )));
93 }
94 Ok(String::from_utf8(read_data)?)
95 }
96}
97
98#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
109pub fn load_dict<P: AsRef<Path>>(path: P) -> Result<Box<dyn DictReader>, DictError> {
110 if path.as_ref().extension() == Some(OsStr::new("dz")) {
111 let reader = File::open(path)?;
112 Ok(Box::new(DictReaderDz::new(reader)?))
113 } else {
114 let reader = BufReader::new(File::open(path)?);
115 Ok(Box::new(DictReaderRaw::new(reader)?))
116 }
117}
118
/// Reader for dictionary data stored in a gzip file that carries a dictzip (`RA`) extra field,
/// which allows individual compressed chunks to be located and inflated on demand.
pub struct DictReaderDz<B: Read + Seek> {
    /// The underlying seekable source containing the compressed file.
    dzdict: B,
    /// Chunk length taken from the extra field; used to map uncompressed offsets to chunk
    /// indices and as the size of the inflate output buffer.
    uchunk_length: usize,
    /// Position in `dzdict` directly after the last compressed chunk (start of the first chunk
    /// plus the sum of all compressed chunk lengths).
    end_compressed_data: usize,
    /// Start position in `dzdict` of every compressed chunk, in file order.
    chunk_offsets: Vec<usize>,
    /// Length of the uncompressed data as determined by `new`; requests ending beyond it are
    /// rejected by `fetch_definition`.
    ufile_length: u64,
}
136
/// Location of one compressed chunk inside the dictzip file.
#[derive(Debug)]
struct Chunk {
    /// Position of the chunk's first byte in the compressed file.
    offset: usize,
    /// Number of compressed bytes belonging to the chunk.
    length: usize,
}
143
144impl<B: Read + Seek> DictReaderDz<B> {
145 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
147 pub fn new(dzdict: B) -> Result<DictReaderDz<B>, DictError> {
148 let mut buffered_dzdict = BufReader::new(dzdict);
149 let mut header = vec![0u8; 12];
150 buffered_dzdict.read_exact(&mut header)?;
151 if header[0..2] != [0x1F, 0x8B] {
152 return Err(DictError::InvalidFileFormat(
153 "Not in gzip format".into(),
154 None,
155 ));
156 }
157
158 let flags = &header[3]; if (flags & GZ_FEXTRA) == 0 {
160 return Err(DictError::InvalidFileFormat(
162 "Extra flag (FLG.FEXTRA) \
163 not set, not in gzip + dzip format"
164 .into(),
165 None,
166 ));
167 }
168
169 let xlen = LittleEndian::read_u16(&header[10..12]);
171
172 let mut fextra = vec![0u8; xlen as usize];
174 buffered_dzdict.read_exact(&mut fextra)?;
175
176 if fextra[0..2] != [b'R', b'A'] {
177 return Err(DictError::InvalidFileFormat(
178 "No dictzip info found in FEXTRA \
179 header (behind XLEN, in SI1SI2 fields)"
180 .into(),
181 None,
182 ));
183 }
184
185 let length_subfield = LittleEndian::read_u16(&fextra[2..4]);
186 assert_eq!(
187 length_subfield,
188 xlen - 4,
189 "the length of the subfield \
190 should be the same as the fextra field, ignoring the \
191 additional length information and the file format identification"
192 );
193 let subf_version = LittleEndian::read_u16(&fextra[4..6]);
194 if subf_version != 1 {
195 return Err(DictError::InvalidFileFormat(
196 "Unimplemented dictzip \
197 version, only ver 1 supported"
198 .into(),
199 None,
200 ));
201 }
202
203 let uchunk_length = LittleEndian::read_u16(&fextra[6..8]);
206 let chunk_count = LittleEndian::read_u16(&fextra[8..10]);
208 if chunk_count == 0 {
209 return Err(DictError::InvalidFileFormat(
210 "No compressed chunks in \
211 file or broken header information"
212 .into(),
213 None,
214 ));
215 }
216
217 let numbers_chunks_which_would_fit = ((fextra.len() - 10) / 2) as u16; if numbers_chunks_which_would_fit != chunk_count {
223 return Err(DictError::InvalidFileFormat(
224 format!(
225 "Expected {} chunks \
226 according to dictzip header, but the FEXTRA field can \
227 accomodate {}; possibly broken file",
228 chunk_count, numbers_chunks_which_would_fit
229 ),
230 None,
231 ));
232 }
233
234 if (flags & GZ_FNAME) != 0 {
236 let mut tmp = Vec::new();
237 buffered_dzdict.read_until(b'\0', &mut tmp)?;
238 }
239
240 if (flags & GZ_COMMENT) != 0 {
242 let mut tmp = Vec::new();
243 buffered_dzdict.read_until(b'\0', &mut tmp)?;
244 }
245
246 if (flags & GZ_FHCRC) != 0 {
248 buffered_dzdict.seek(SeekFrom::Current(2))?;
249 }
250
251 let mut chunk_offsets = Vec::with_capacity(chunk_count as usize);
253 let mut end_compressed_data = buffered_dzdict.seek(SeekFrom::Current(0))? as usize;
255 let chunks_from_header = &fextra[10usize..(10 + chunk_count * 2) as usize];
257
258 for index in (0..chunks_from_header.len()).filter(|i| (i % 2) == 0) {
260 let index = index as usize;
261 let compressed_len =
262 LittleEndian::read_u16(&chunks_from_header[index..(index + 2)]) as usize;
263 chunk_offsets.push(end_compressed_data);
264 end_compressed_data += compressed_len;
265 }
266 assert_eq!(chunk_offsets.len() as u16, chunk_count, "The read number of compressed chunks in \
267 the .dz file must be equivalent to the number of chunks actually found in the file.\n");
268
269 buffered_dzdict.seek(SeekFrom::Start(end_compressed_data as u64))?;
271 let uncompressed = buffered_dzdict.read_i32::<LittleEndian>()?;
272
273 Ok(DictReaderDz {
274 dzdict: buffered_dzdict.into_inner(),
275 chunk_offsets,
276 end_compressed_data,
277 uchunk_length: uchunk_length as usize,
278 ufile_length: uncompressed as u64,
279 })
280 }
281
282 #[cfg_attr(
283 feature = "tracing",
284 tracing::instrument(skip(self), fields(start_offset, length))
285 )]
286 fn get_chunks_for(&self, start_offset: u64, length: u64) -> Vec<Chunk> {
287 let mut chunks = Vec::new();
288 let start_chunk = start_offset as usize / self.uchunk_length;
289 let end_chunk = (start_offset + length) as usize / self.uchunk_length;
290 for id in start_chunk..=end_chunk {
291 let chunk_length = match self.chunk_offsets.get(id + 1) {
292 Some(next) => next - self.chunk_offsets[id],
293 None => self.end_compressed_data - self.chunk_offsets[id],
294 };
295 chunks.push(Chunk {
296 offset: self.chunk_offsets[id],
297 length: chunk_length,
298 });
299 }
300
301 chunks
302 }
303
304 #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(data_len = data.len())))]
306 fn inflate(&self, data: Vec<u8>) -> Result<Vec<u8>, DictError> {
307 let mut decoder = flate2::Decompress::new(false);
308 let mut decoded = vec![0u8; self.uchunk_length];
309 decoder.decompress(
310 data.as_slice(),
311 decoded.as_mut_slice(),
312 flate2::FlushDecompress::None,
313 )?;
314 Ok(decoded)
315 }
316}
317
318impl<B: Read + Seek> DictReader for DictReaderDz<B> {
319 #[cfg_attr(
321 feature = "tracing",
322 tracing::instrument(skip(self), fields(start_offset, length))
323 )]
324 fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError> {
325 if length > MAX_BYTES_FOR_BUFFER {
326 return Err(DictError::MemoryError);
327 }
328 if (start_offset + length) > self.ufile_length {
329 return Err(DictError::IoError(io::Error::new(
330 io::ErrorKind::UnexpectedEof,
331 "a \
332 seek beyond the end of uncompressed data was requested",
333 )));
334 }
335 let mut data = Vec::new();
336 for chunk in self.get_chunks_for(start_offset, length) {
337 let pos = self.dzdict.seek(SeekFrom::Start(chunk.offset as u64))?;
338 if pos != (chunk.offset as u64) {
339 return Err(DictError::IoError(io::Error::new(
340 io::ErrorKind::Other,
341 format!(
342 "attempted to seek to {} but new position is {}",
343 chunk.offset, pos
344 ),
345 )));
346 }
347 let mut definition = vec![0u8; chunk.length];
348 self.dzdict.read_exact(&mut definition)?;
349 data.push(self.inflate(definition)?);
350 }
351
352 let cut_front = start_offset as usize % self.uchunk_length;
354 let data = match data.len() {
356 0 => panic!(),
357 1 => data[0][cut_front..cut_front + length as usize].to_vec(),
358 n => {
359 let mut tmp = data[0][cut_front..].to_vec();
360 for text in data.iter().skip(1).take(n - 2) {
362 tmp.extend_from_slice(text);
363 }
364 let remaining_bytes = (length as usize + cut_front) % self.uchunk_length;
366 tmp.extend_from_slice(&data[n - 1][..remaining_bytes]);
367 tmp
368 }
369 };
370 Ok(String::from_utf8(data)?)
371 }
372}