-- -*- coding: utf-8 -*-
--[[
Copyright 2013 Stephan Hennig
This file is part of Padrinoma.
Padrinoma is free software: you can redistribute it and/or modify it
under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Padrinoma is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public
License along with Padrinoma. If not, see
--
-- This trie requires keys to be in table representation. Table
-- representation of a key is a sequence of letters (a table
-- with values starting at index 1). The term letter here
-- refers to any valid Lua value, except the value `nil`. That is, the
-- alphabet of valid letters is not restricted to characters
-- representing letters as known from scripts. As an example, the key
-- {'h', 'e', 'l', 'l', 'o'}
consists of five letters (each
-- one a single character) and might represent the word
-- hello
, while the table {'hello'}
represents
-- an entirely different word with only a single letter (a string). Two
-- tables represent the same key, if they contain the same combination
-- of letters (the same key/value pairs). A function is provided to
-- convert an arbitrary key into table representation.
--
-- Some numbers: Storing 435,000 German words with 5.4 million
-- characters results in a trie with 1.2 million nodes consuming 77 MB
-- of memory. That is, in this example application, a single node in
-- the trie uses ca. 69 bytes of memory. Other than that, the
-- implementation is fully functional. But you have been
-- warned!
--
-- This class is derived from class `autotype-cls_pdnm_oop`.
--
--
-- @class module
-- @name autotype-cls_pdnm_trie_simple
-- @author Stephan Hennig
-- @copyright 2013, Stephan Hennig
-- API-Dokumentation can be generated via
-- -- luadoc -d API *.lua -- ---- Load third-party modules. local unicode = require('unicode') local cls_oop = require('autotype-cls_pdnm_oop') -- @trick Prevent LuaDoc from looking past here for module description. --[[ Trick LuaDoc into entering 'module' mode without using that command. module(...) --]] -- Local module table. local M = cls_oop:new() -- Short-cuts. local Tconcat = table.concat local Tinsert = table.insert local Tremove = table.remove local Ufind = unicode.utf8.find local UAsub = unicode.ascii.sub local Ugmatch = unicode.utf8.gmatch --- Create new trie node. -- This function returns a newly created trie node with no associated -- value. -- -- @param self Callee reference. -- @return Trie node. local function new_node(self) return {} end M.new_node = new_node --- Set value associated with a trie node. -- This function stores a value in a trie associated with the given -- node. -- -- @param self Callee reference. -- @param node Trie node. -- @param value New value. -- @return Old value associated with node. -- @see get_value local function set_value(self, node, value) local old_value = self.value[node] self.value[node] = value return old_value end M.set_value = set_value --- Get value associated with a trie node. -- This function retrieves a value in a trie associated with the given -- node. -- -- @param self Callee reference. -- @param node Trie node. -- @return Value associated with node. -- @see set_value local function get_value(self, node) return self.value[node] end M.get_value = get_value --- Get reference to root node. -- This function returns a reference to the root node of the trie -- object. -- -- @param self Callee reference. -- @return Root node. local function get_root(self) return self.root end M.get_root = get_root --- Traverse trie in breadth-first order. -- For every node visited, a user-supplied function is called. -- Arguments of that function are a reference to the trie object and a -- node. The order a node's child nodes are visited is unspecified. -- -- @param self Callee reference. -- @param visit User-defined function. local function bfs(self, visit) assert(type(visit) == "function", 'Bad visit function!') -- Initialize sequence of all zero level nodes (root node only). local current_level = { self:get_root() } -- Sequence of nodes of the next level. local next_level -- Is the current level sequence non-empty? while #current_level > 0 do -- Start with empty list of child nodes. next_level = {} -- Iterate over nodes of the current level. for _,node in ipairs(current_level) do -- Visit node. visit(self, node) -- Collect all child nodes. for _,child in pairs(node) do Tinsert(next_level, child) end end -- Prepare for processing next level. current_level = next_level end end M.bfs = bfs --- Debug traverse trie in breadth-first order. -- This function differs from function bfs() in that the user-supplied -- visit function can access more node related data. Visit function -- arguments are a reference to the trie object and a table with keys -- `node`, `level`, `parent`, `letter`, `prefix`. Additionally, before -- and after the set of nodes of the same level are visited, two more -- user-supplied functions are called (if non-nil). Arguments of those -- functions are a reference to the trie object and the level number. -- Since deterministic traversal is desirable for some applications, a -- node's child nodes are visited in ascending transition letter order. -- (Internal sorting is done via byte-based `table.sort`). -- -- @param self Callee reference. -- @param visit User-defined function or nil. -- @param level_preamble User-defined function or nil. -- @param level_postamble User-defined function or nil. -- @see bfs local function debug_bfs(self, visit, level_preamble, level_postamble) assert(visit == nil or type(visit) == "function", 'Bad visit function!') assert(level_preamble == nil or type(level_preamble) == "function", 'Bad level preamble!') assert(level_postamble == nil or type(level_postamble) == "function", 'Bad level postamble!') local level = 0 -- Initialize sequence of all zero level nodes (root node only). local current_level = { -- Root node. { node = self:get_root(), level = level, parent = nil, letter = nil, prefix = {}, } } -- Sequence of nodes of the next level. local next_level -- Is the current level sequence non-empty? while #current_level > 0 do -- Start with empty list of child nodes. next_level = {} -- Call level preamble function. if level_preamble ~= nil then level_preamble(self, level) end -- Iterate over nodes of the current level. for _,data in ipairs(current_level) do -- Visit node. visit(self, data) -- Sort child nodes for more deterministic behaviour. May be -- relevant when creating a visual trie representation. local a = {} for letter,child in pairs(data.node) do Tinsert(a, letter) end table.sort(a) -- Collect all child nodes. for _,letter in ipairs(a) do -- Create prefix copy. local pfx = {} for _,l in ipairs(data.prefix) do Tinsert(pfx, l) end -- Append current letter. Tinsert(pfx, letter) -- Store node. Tinsert(next_level, { node = data.node[letter], level = level, parent = data.node, letter = letter, prefix = pfx, } ) end end -- Call level postamble function. if level_postamble ~= nil then level_postamble(self, level) end -- Prepare for processing next level. current_level = next_level level = level + 1 end end M.debug_bfs = debug_bfs --- Traverse trie in depth-first order. -- For every node visited, a user-supplied function is called pre-order. -- Arguments of that function are a reference to the trie object and a -- node. The order a node's child nodes are visited is unspecified. -- -- @param self Callee reference. -- @param visit User-defined function. local function dfs(self, visit) assert(type(visit) == 'function', 'Bad visit function!') -- Workhorse function for dfs recursion. local function recurse(self, node) -- Visit node. visit(self, node) -- Recurse into child nodes. for _,child in pairs(node) do recurse(self, child) end end -- Recurse starting at trie root. recurse(self, self:get_root()) end M.dfs = dfs --- Debug traverse trie in depth-first order. -- This function differs from function dfs() in that pre-order as well -- as post-order node visiting is possible. Additionally, user-supplied -- visit function(s) can access more node related data. Visit function -- arguments are a reference to the trie object and a table with keys -- `node`, `level`, `parent`, `letter`, `prefix`. Since deterministic -- traversal is desirable for some applications, a node's child nodes -- are visited in ascending transition letter order. (Internal sorting -- is done via byte-based `table.sort`). -- -- @param self Callee reference. -- @param visit_pre_order User-defined function or nil. -- @param visit_post_order User-defined function or nil. -- @see dfs local function debug_dfs(self, visit_pre_order, visit_post_order) assert(visit_pre_order == nil or type(visit_pre_order) == 'function', 'Bad visit function!') assert(visit_post_order == nil or type(visit_post_order) == 'function', 'Bad visit function!') -- Recursion upvalue table collecting various data related to the -- node to be visited. This table is the argument to visit -- functions. local data = { -- Current node. node = nil, -- Level the current node is placed with the trie. level = nil, -- Patent node. parent = nil, -- Transition letter that lead from parent to node. letter = nil, -- A sequence containing all transition letters that lead to the -- current node from root. prefix = {}, } -- Workhorse function for dfs recursion. This function, in contrast -- to visit functions, doesn't have a table argument for memory -- efficiency reasons. local function recurse(self, node, level, parent, letter) -- Update node related data. data.node = node data.level = level data.parent = parent data.letter = letter -- Visit node pre-order. if visit_pre_order ~= nil then visit_pre_order(self, data) end -- Recurse into child nodes. for letter,child in pairs(node) do -- Update prefix. Tinsert(data.prefix, letter) recurse(self, child, level + 1, node, letter) -- Restore prefix. Tremove(data.prefix) end -- Restore node related data for post-order visitor. data.node = node data.level = level data.parent = parent data.letter = letter -- Visit node post-order. if visit_post_order ~= nil then visit_post_order(self, data) end end -- Recurse starting at trie root. recurse(self, self:get_root(), 0, nil, nil) end M.debug_dfs = debug_dfs --- Convert a key to table representation. -- A table argument is returned as is. A string argument is converted -- into a table containing UTF-8 characters as values, starting at index -- 1 (a sequence). Any non-table, non-string argument is converted into -- a table with the argument as value at index 1 (a sequence). -- -- @param key Key to convert into table representation. -- @return Key in table representation. local function key(self, key) local key_type = type(key) if key_type == 'table' then return key elseif key_type == 'string' then key_table = {} for ch in Ugmatch(key, '.') do Tinsert(key_table, ch) end return key_table else return { key } end end M.key = key --- Insert a key into a trie. -- Any existing value is replaced by the new value. -- -- @param self Callee reference. -- @param key A key in table representation. -- @param new_value A non-nil value associated with the key. -- @return Old value associated with key. local function insert(self, key, new_value) assert(type(key) == 'table','Key must be in table representation. Got ' .. type(key) .. ': ' .. tostring(key)) -- Start inserting letters at root node. local node = self.root assert(type(node) == 'table', 'Trie root not found!') -- Iterate over key letters. for _,letter in ipairs(key) do -- Search matching edge. local next = node[letter] -- Need to insert new edge? if next == nil then next = self:new_node() node[letter] = next end -- Advance. node = next end -- Save old value. local old_value = self.value[node] -- Set new value. self.value[node] = new_value -- Return old value. return old_value end M.insert = insert --- Search for a key in trie. -- -- @param self Callee reference. -- @param key Key to search in table representation. -- @return Value associated with key, or `nil` if the key cannot be found. local function find(self, key) assert(type(key) == 'table','Key must be in table representation. Got ' .. type(key) .. ': ' .. tostring(key)) -- Start searching at root node. local node = self.root assert(type(node) == 'table', 'Trie root not found!') -- Iterate over key letters. for _,letter in ipairs(key) do -- Search matching edge. node = node[letter] if node == nil then return nil end end -- Return value associated with target node. return self.value[node] end M.find = find --- Get buffer size used for reading files. -- -- @param self Callee reference. -- @return Buffer size in bytes. -- @see read_file local function get_buffer_size(self) return self.file_buffer_size end M.get_buffer_size = get_buffer_size --- Set buffer size used for reading files. -- -- @param self Callee reference. -- @param buffer_size New buffer size in bytes. -- @return Old buffer size in bytes. -- @see read_file local function set_buffer_size(self, buffer_size) local old_buffer_size = self.file_buffer_size self.file_buffer_size = buffer_size return old_buffer_size end M.set_buffer_size = set_buffer_size --- Get chunk iterator. -- Reading records from files is done through a buffer. This function -- returns an iterator over the chunks. You might want to refer to this -- iterator when implementing a custom record iterator. The file handle -- argument is not closed. -- -- @param self Callee reference. -- @param fin Input file handle. -- @return Chunk iterator. -- @see read_file -- @see file_records -- @see set_buffer_size -- @see get_buffer_size local function file_chunks(self, fin) -- Check for valid file handle. assert(fin, 'Invalid input file handle.') -- Get local reference to buffer size. local BUFFERSIZE = self:get_buffer_size() return function () -- Read next chunk. local chunk, rest = fin:read(BUFFERSIZE, '*l') -- EOF? if not chunk then return end if rest then chunk = chunk .. rest .. '\n' end return chunk end end M.file_chunks = file_chunks --- Get record iterator. -- This function returns an iterator over the records in a file. -- Records are specified by the last argument, which must be a Lua -- string pattern. The file handle argument is not closed. -- -- @param self Callee reference. -- @param fin Input file handle. -- @param rec_pattern A Lua string pattern, determining what is -- considered a record. -- @return Record iterator. -- @see read_file -- @see file_chunks -- @see set_buffer_size local function file_records(self, fin, rec_pattern) -- Chunk iterator. local next_chunk = self:file_chunks(fin) -- Initialize chunk. local chunk = next_chunk() or '' local pos = 1 return function() repeat local s, e = Ufind(chunk, rec_pattern, pos) -- Record found? if s then -- Update position. pos = e + 1 -- Return record. return UAsub(chunk, s, e) else -- Read new chunk. chunk = next_chunk() -- EOF? if not chunk then return end pos = 1 end until false end end M.file_records = file_records --- Convert a record read from a file into a key to insert into trie. -- By default, table representation of the record is returned. -- -- @param self Callee reference. -- @param record A record. -- @return Key. -- @see read_file local function record_to_key(self, record) return self:key(record) end M.record_to_key = record_to_key --- Convert a record read from a file into a value to be associated with --- a key in trie. -- By default, the value `true` is returned. -- -- @param self Callee reference. -- @param record A record. -- @return Value. -- @see read_file local function record_to_value(self, record) return true end M.record_to_value = record_to_value --- Insert a record, e.g., read from a file, into a trie, transforming --- it into a key-value pair before. -- This function is useful when reading records from a file manually, -- e.g., when file records are filtered. -- -- @param self Callee reference. -- @param record A record. -- @return Old value associated with key. -- @see insert -- @see record_to_key -- @see record_to_value local function insert_record(self, record) return self:insert(self:record_to_key(record), self:record_to_value(record)) end M.insert_record = insert_record --- Read a file and store the contents in trie. -- This function callback driven reads a file and stores the contents in -- the trie. The given file handle is not closed. -- -- Note, files are buffered while reading. Lines in the input file are -- always read as a whole into the buffer. The record pattern is always -- searched for in the current read buffer. That is, records cannot be -- larger than the file buffer. In general, this is not a restriction. -- If it is for you, you have to replace this function with a custom -- one. -- -- @param self Callee reference. -- @param fin File handle to read records from. -- @param rec_pattern A Lua string pattern, determining what is -- considered a record. If this parameter is an empty string or `nil`, -- records are considered complete lines (without line ending -- characters). -- @return Number new (keys, value) pairs stored in trie. -- @see record_to_key -- @see record_to_value -- @see file_records -- @see file_chunks -- @see set_buffer_size local function read_file(self, fin, rec_pattern) -- Initialize record pattern. if rec_pattern == nil or rec_pattern == '' then rec_pattern = '[^\n\r]+' end local count = 0 for record in self:file_records(fin, rec_pattern) do local key = self:record_to_key(record) local value = self:record_to_value(record) local old_value = self:insert(key, value) if value ~= old_value and value ~= nil then count = count + 1 end end return count end M.read_file = read_file --- Count number of nodes in trie. -- -- @param self Callee reference. -- @return Number of nodes. local function count_nodes(self) -- Init visit upvalue. local n = 0 -- Visit function. local function visit(self, node) -- Increment counter for every node. n = n + 1 end -- Recurse into trie. self:dfs(visit) return n end M.count_nodes = count_nodes --- Converts a value associated with a key in the trie into string. -- By default, Lua's `tostring` function is applied to the value. -- -- @param self Callee reference. -- @param value A value. -- @return String represenation of the value. local function value_to_string(self, value) return tostring(value) end M.value_to_string = value_to_string --- Visit function for outputting a trie in full format. -- Values are separated from key by an equals sign `=`. -- -- @param self Callee reference. -- @param data Table with node related data. -- @see debug_output -- @see debug_dfs local function debug_output_visit_full(self, data) local node = data.node local prefix = data.prefix -- Has current node an associated value? local value = self.value[node] if value ~= nil then -- Output key and value. io.write(Tconcat(prefix), '=', self:value_to_string(value), '\n') end end M.debug_output_visit_full = debug_output_visit_full --- Visit function for outputting a trie in sparse format. -- Values are separated from key by an equals sign `=`. -- -- @param self Callee reference. -- @param data Table with node related data. -- @see debug_output -- @see debug_dfs local function debug_output_visit_sparse(self, data) local node = data.node local prefix = data.prefix -- Has current node an associated value? local value = self.value[node] local output if value ~= nil then -- Output key and value. output = { Tconcat(prefix), '=', self:value_to_string(value) } else -- Count child nodes. local cnt_childs = 0 for letter,child in pairs(node) do cnt_childs = cnt_childs + 1 end -- In sparse format, if a node has more than one child node, -- output current prefix immediately. But not at root node. if cnt_childs > 1 and data.level > 0 then output = { Tconcat(prefix) } end end if output then Tinsert(output, '\n') io.write(Tconcat(output)) -- In sparse format, output letters leading to this node only -- once. for i,_ in ipairs(prefix) do data.prefix[i] = ' ' end end end M.debug_output_visit_sparse = debug_output_visit_sparse --- Output a string representation of a trie to a file. -- Values are separated from key by an equals sign `=`. Default file -- name is `trie.txt`. -- -- @param self Callee reference. -- @param fname Output file name. -- @param is_sparse flag: Output keys in sparse format? -- @see debug_output_visit_full -- @see debug_output_visit_sparse local function debug_output(self, fname, is_sparse) -- Save current default output. local fout = io.output() -- Direct default output to a custom file. io.output(type(fname) == 'string' and fname ~= '' and fname or 'trie.txt') -- Choose visit function. local visit = is_sparse and self.debug_output_visit_sparse or self.debug_output_visit_full -- Traverse trie in DFS order. self:debug_dfs(visit) -- Flush output. io.flush() -- Restore default output. io.output(fout) end M.debug_output = debug_output --- Determine a unique node name of a node for dot output. -- -- @param node A node. -- @return A string. -- @see debug_dot local function debug_dot_nname(node) -- The address of a node's table is unique. local s = tostring(node):gsub('table: ', 'n') return s end M.debug_dot_nname = debug_dot_nname --- Visit function that outputs a node in dot output. -- Nodes associated with a value are emphasized. -- -- @param self Callee reference. -- @param data Table with node related data. -- @see debug_dot -- @see debug_bfs local function debug_dot_visit_output_node(self, data) local node = data.node local nsource = debug_dot_nname(node) local source_attr = (self.value[node] ~= nil) and string.format('fillcolor="/greens7/2", fontname="Bold Italic"') or '' io.write(string.format(' %s [ label="%s", %s ]\n', nsource, table.concat(data.prefix), source_attr)) end M.debug_dot_visit_output_node = debug_dot_visit_output_node --- Output preamble for nodes of the same level in dot output. -- Set rank=same attribute for nodes of the same level. -- -- @param self Callee reference. -- @param level Node level. -- @see debug_dot -- @see debug_bfs local function debug_dot_level_preamble_output_node(self, level) io.write(string.format('\n// begin level %d\n', level)) io.write('{ rank=same // dot\n') end M.debug_dot_level_preamble_output_node = debug_dot_level_preamble_output_node --- Output postamble for nodes of the same level in dot output. -- Close sub-graph. -- -- @param self Callee reference. -- @param level Node level. -- @see debug_dot -- @see debug_bfs local function debug_dot_level_postamble_output_node(self, level) io.write('}\n') io.write(string.format('// end level %d\n\n', level)) end M.debug_dot_level_postamble_output_node = debug_dot_level_postamble_output_node --- Visit function that outputs a node's valid transition edges in dot --- output. -- -- @param self Callee reference. -- @param data Table with node related data. -- @see debug_dot -- @see debug_dfs local function debug_dot_visit_output_transition_edges(self, data) local node = data.node local nsource = debug_dot_nname(node) -- Sort outgoing letters to get more deterministic output. local a = {} for outletter,child in pairs(node) do table.insert(a, outletter) end table.sort(a) -- Output child nodes in deterministic order. for _,outletter in ipairs(a) do local child = node[outletter] local nsink = debug_dot_nname(child) io.write(string.format(' %s -> %s [ label="%s" ]\n', nsource, nsink, outletter)) end end M.debug_dot_visit_output_transition_edges = debug_dot_visit_output_transition_edges --- Output trie in graphviz dot format to a file. -- See