1use std::io;
2
3use crate::bytes;
4use crate::error::Result;
5use crate::raw::counting_writer::CountingWriter;
6use crate::raw::error::Error;
7use crate::raw::registry::{Registry, RegistryEntry};
8use crate::raw::{
9 CompiledAddr, Fst, FstType, Output, Transition, EMPTY_ADDRESS,
10 NONE_ADDRESS, VERSION,
11};
12use crate::stream::{IntoStreamer, Streamer};
14
15pub struct Builder<W> {
44 wtr: CountingWriter<W>,
48 unfinished: UnfinishedNodes,
53 registry: Registry,
59 last: Option<Vec<u8>>,
64 last_addr: CompiledAddr,
71 len: usize,
73}
74
75#[derive(Debug)]
76struct UnfinishedNodes {
77 stack: Vec<BuilderNodeUnfinished>,
78}
79
80#[derive(Debug)]
81struct BuilderNodeUnfinished {
82 node: BuilderNode,
83 last: Option<LastTransition>,
84}
85
86#[derive(Debug, Hash, Eq, PartialEq)]
87pub struct BuilderNode {
88 pub is_final: bool,
89 pub final_output: Output,
90 pub trans: Vec<Transition>,
91}
92
93#[derive(Debug)]
94struct LastTransition {
95 inp: u8,
96 out: Output,
97}
98
99impl Builder<Vec<u8>> {
100 #[inline]
102 pub fn memory() -> Builder<Vec<u8>> {
103 Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap()
104 }
105
106 #[inline]
108 pub fn into_fst(self) -> Fst<Vec<u8>> {
109 self.into_inner().and_then(Fst::new).unwrap()
110 }
111}
112
113impl<W: io::Write> Builder<W> {
114 pub fn new(wtr: W) -> Result<Builder<W>> {
117 Builder::new_type(wtr, 0)
118 }
119
120 pub fn new_type(wtr: W, ty: FstType) -> Result<Builder<W>> {
123 let mut wtr = CountingWriter::new(wtr);
124 bytes::io_write_u64_le(VERSION, &mut wtr)?;
128 bytes::io_write_u64_le(ty, &mut wtr)?;
130 Ok(Builder {
131 wtr,
132 unfinished: UnfinishedNodes::new(),
133 registry: Registry::new(10_000, 2),
134 last: None,
135 last_addr: NONE_ADDRESS,
136 len: 0,
137 })
138 }
139
140 pub fn add<B>(&mut self, bs: B) -> Result<()>
142 where
143 B: AsRef<[u8]>,
144 {
145 self.check_last_key(bs.as_ref(), false)?;
146 self.insert_output(bs, None)
147 }
148
149 pub fn insert<B>(&mut self, bs: B, val: u64) -> Result<()>
159 where
160 B: AsRef<[u8]>,
161 {
162 self.check_last_key(bs.as_ref(), true)?;
163 self.insert_output(bs, Some(Output::new(val)))
164 }
165
166 pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()>
175 where
176 T: AsRef<[u8]>,
177 I: IntoIterator<Item = (T, Output)>,
178 {
179 for (key, out) in iter {
180 self.insert(key, out.value())?;
181 }
182 Ok(())
183 }
184
185 pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
194 where
195 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
196 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
197 {
198 let mut stream = stream.into_stream();
199 while let Some((key, out)) = stream.next() {
200 self.insert(key, out.value())?;
201 }
202 Ok(())
203 }
204
205 pub fn finish(self) -> Result<()> {
209 self.into_inner()?;
210 Ok(())
211 }
212
213 pub fn into_inner(mut self) -> Result<W> {
216 self.compile_from(0)?;
217 let root_node = self.unfinished.pop_root();
218 let root_addr = self.compile(&root_node)?;
219 bytes::io_write_u64_le(self.len as u64, &mut self.wtr)?;
220 bytes::io_write_u64_le(root_addr as u64, &mut self.wtr)?;
221
222 let sum = self.wtr.masked_checksum();
223 let mut wtr = self.wtr.into_inner();
224 bytes::io_write_u32_le(sum, &mut wtr)?;
225 wtr.flush()?;
226 Ok(wtr)
227 }
228
229 fn insert_output<B>(&mut self, bs: B, out: Option<Output>) -> Result<()>
230 where
231 B: AsRef<[u8]>,
232 {
233 let bs = bs.as_ref();
234 if bs.is_empty() {
235 self.len = 1; self.unfinished.set_root_output(out.unwrap_or(Output::zero()));
237 return Ok(());
238 }
239 let (prefix_len, out) = if let Some(out) = out {
240 self.unfinished.find_common_prefix_and_set_output(bs, out)
241 } else {
242 (self.unfinished.find_common_prefix(bs), Output::zero())
243 };
244 if prefix_len == bs.len() {
245 assert!(out.is_zero());
252 return Ok(());
253 }
254 self.len += 1;
255 self.compile_from(prefix_len)?;
256 self.unfinished.add_suffix(&bs[prefix_len..], out);
257 Ok(())
258 }
259
260 fn compile_from(&mut self, istate: usize) -> Result<()> {
261 let mut addr = NONE_ADDRESS;
262 while istate + 1 < self.unfinished.len() {
263 let node = if addr == NONE_ADDRESS {
264 self.unfinished.pop_empty()
265 } else {
266 self.unfinished.pop_freeze(addr)
267 };
268 addr = self.compile(&node)?;
269 assert!(addr != NONE_ADDRESS);
270 }
271 self.unfinished.top_last_freeze(addr);
272 Ok(())
273 }
274
275 fn compile(&mut self, node: &BuilderNode) -> Result<CompiledAddr> {
276 if node.is_final
277 && node.trans.is_empty()
278 && node.final_output.is_zero()
279 {
280 return Ok(EMPTY_ADDRESS);
281 }
282 let entry = self.registry.entry(&node);
283 if let RegistryEntry::Found(ref addr) = entry {
284 return Ok(*addr);
285 }
286 let start_addr = self.wtr.count() as CompiledAddr;
287 node.compile_to(&mut self.wtr, self.last_addr, start_addr)?;
288 self.last_addr = self.wtr.count() as CompiledAddr - 1;
289 if let RegistryEntry::NotFound(cell) = entry {
290 cell.insert(self.last_addr);
291 }
292 Ok(self.last_addr)
293 }
294
295 fn check_last_key(&mut self, bs: &[u8], check_dupe: bool) -> Result<()> {
296 if let Some(ref mut last) = self.last {
297 if check_dupe && bs == &**last {
298 return Err(Error::DuplicateKey { got: bs.to_vec() }.into());
299 }
300 if bs < &**last {
301 return Err(Error::OutOfOrder {
302 previous: last.to_vec(),
303 got: bs.to_vec(),
304 }
305 .into());
306 }
307 last.clear();
308 for &b in bs {
309 last.push(b);
310 }
311 } else {
312 self.last = Some(bs.to_vec());
313 }
314 Ok(())
315 }
316
317 pub fn get_ref(&self) -> &W {
319 self.wtr.get_ref()
320 }
321
322 pub fn bytes_written(&self) -> u64 {
324 self.wtr.count()
325 }
326}
327
328impl UnfinishedNodes {
329 fn new() -> UnfinishedNodes {
330 let mut unfinished = UnfinishedNodes { stack: Vec::with_capacity(64) };
331 unfinished.push_empty(false);
332 unfinished
333 }
334
335 fn len(&self) -> usize {
336 self.stack.len()
337 }
338
339 fn push_empty(&mut self, is_final: bool) {
340 self.stack.push(BuilderNodeUnfinished {
341 node: BuilderNode { is_final, ..BuilderNode::default() },
342 last: None,
343 });
344 }
345
346 fn pop_root(&mut self) -> BuilderNode {
347 assert!(self.stack.len() == 1);
348 assert!(self.stack[0].last.is_none());
349 self.stack.pop().unwrap().node
350 }
351
352 fn pop_freeze(&mut self, addr: CompiledAddr) -> BuilderNode {
353 let mut unfinished = self.stack.pop().unwrap();
354 unfinished.last_compiled(addr);
355 unfinished.node
356 }
357
358 fn pop_empty(&mut self) -> BuilderNode {
359 let unfinished = self.stack.pop().unwrap();
360 assert!(unfinished.last.is_none());
361 unfinished.node
362 }
363
364 fn set_root_output(&mut self, out: Output) {
365 self.stack[0].node.is_final = true;
366 self.stack[0].node.final_output = out;
367 }
368
369 fn top_last_freeze(&mut self, addr: CompiledAddr) {
370 let last = self.stack.len().checked_sub(1).unwrap();
371 self.stack[last].last_compiled(addr);
372 }
373
374 fn add_suffix(&mut self, bs: &[u8], out: Output) {
375 if bs.is_empty() {
376 return;
377 }
378 let last = self.stack.len().checked_sub(1).unwrap();
379 assert!(self.stack[last].last.is_none());
380 self.stack[last].last = Some(LastTransition { inp: bs[0], out });
381 for &b in &bs[1..] {
382 self.stack.push(BuilderNodeUnfinished {
383 node: BuilderNode::default(),
384 last: Some(LastTransition { inp: b, out: Output::zero() }),
385 });
386 }
387 self.push_empty(true);
388 }
389
390 fn find_common_prefix(&mut self, bs: &[u8]) -> usize {
391 bs.iter()
392 .zip(&self.stack)
393 .take_while(|&(&b, ref node)| {
394 node.last.as_ref().map(|t| t.inp == b).unwrap_or(false)
395 })
396 .count()
397 }
398
399 fn find_common_prefix_and_set_output(
400 &mut self,
401 bs: &[u8],
402 mut out: Output,
403 ) -> (usize, Output) {
404 let mut i = 0;
405 while i < bs.len() {
406 let add_prefix = match self.stack[i].last.as_mut() {
407 Some(ref mut t) if t.inp == bs[i] => {
408 i += 1;
409 let common_pre = t.out.prefix(out);
410 let add_prefix = t.out.sub(common_pre);
411 out = out.sub(common_pre);
412 t.out = common_pre;
413 add_prefix
414 }
415 _ => break,
416 };
417 if !add_prefix.is_zero() {
418 self.stack[i].add_output_prefix(add_prefix);
419 }
420 }
421 (i, out)
422 }
423}
424
425impl BuilderNodeUnfinished {
426 fn last_compiled(&mut self, addr: CompiledAddr) {
427 if let Some(trans) = self.last.take() {
428 self.node.trans.push(Transition {
429 inp: trans.inp,
430 out: trans.out,
431 addr,
432 });
433 }
434 }
435
436 fn add_output_prefix(&mut self, prefix: Output) {
437 if self.node.is_final {
438 self.node.final_output = prefix.cat(self.node.final_output);
439 }
440 for t in &mut self.node.trans {
441 t.out = prefix.cat(t.out);
442 }
443 if let Some(ref mut t) = self.last {
444 t.out = prefix.cat(t.out);
445 }
446 }
447}
448
449impl Clone for BuilderNode {
450 fn clone(&self) -> BuilderNode {
451 BuilderNode {
452 is_final: self.is_final,
453 final_output: self.final_output,
454 trans: self.trans.clone(),
455 }
456 }
457
458 fn clone_from(&mut self, source: &BuilderNode) {
459 self.is_final = source.is_final;
460 self.final_output = source.final_output;
461 self.trans.clear();
462 self.trans.extend(source.trans.iter());
463 }
464}
465
466impl Default for BuilderNode {
467 fn default() -> BuilderNode {
468 BuilderNode {
469 is_final: false,
470 final_output: Output::zero(),
471 trans: vec![],
472 }
473 }
474}