http://mathling.com/string/aho  library module

http://mathling.com/string/aho


Aho-Corasick implementation

Usage:
1. Construct the trie
2. Populate the trie with words
3. Initialize failure states
4. Repeatedly use the trie to find matches in text

The trie must have failure states initialized to be used, and cannot have
additional words added once it has been so initialized. You can use the
full constructor to do steps 1 through 3 in one go.

A trie cannot be have its configuration altered once it has been
populated.

Copyright© Mary Holstege 2020-2023
CC-BY (https://creativecommons.org/licenses/by/4.0/)

February 2021

Imports

http://mathling.com/core/utilities
import module namespace util="http://mathling.com/core/utilities"
       at "../core/utilities.xqy"
http://mathling.com/core/errors
import module namespace errors="http://mathling.com/core/errors"
       at "../core/errors.xqy"

Variables

Variable: $DEFAULT-OPTIONS as map(xs:string,item())


Default matching options

only-whole-words: true if complete words should match
a "word" is determined by a word-bounds-regex
allow-overlaps: true if overlapping matches will be returned
word-bounds-regex: regular expression used to detect non-whole-words
If the characters (if any) immediately before and after the
candidate do not match it, the candidate is rejected.
Note: presence of these characters within the 'word' is OK

Variable: $DEFAULT-CONFIGURATION as map(xs:string,item())


Default configuration options

case-insensitive: true if case-insensitive matching should be used
unicode-normalization: "none" or one of the normalization forms per
fn:normalize-unicode

Functions

Function: trie
declare function trie($config as map(xs:string,item()), $words as xs:string*) as map(xs:string,item())


trie()
Construct a complete trie. Once this is called, additional keywords
cannot be added.

Params
  • config as map(xs:string,item()): a map of configuration settings
  • words as xs:string*: words to put into the trie
Returns
  • map(xs:string,item())
declare function this:trie(
  $config as map(xs:string,item()),
  $words as xs:string*
) as map(xs:string,item())
{
  this:insert-all(
    this:trie($config),
    $words
  )=>this:initialize-states()
}

Function: trie
declare function trie() as map(xs:string, item())


trie()
Construct a trie using the default configuration. The trie will need
to have words added to it and failure states initialized to be used.

Returns
  • map(xs:string,item())
declare function this:trie() as map(xs:string, item())
{
  map {
    "config": $this:DEFAULT-CONFIGURATION, (: configuration settings :)
    "root": 1,                             (: index of root state :)
    "constructed-failure-states": false(), (: states initialized? :)
    "num-keywords": 0,                     (: number of entries :)
    "num-states": 1,                       (: number of states :)
    "states":                              (: state array :)
      this:state-array(this:empty-state())=>this:array-put(1, this:state(0,1)),
    "size-map": map {}                     (: cache of keyword lengths :)
  }
}

Function: trie
declare function trie($config as map(xs:string,item())) as map(xs:string,item())


trie()
Construct a trie with a specific configuration. The trie will need
to have words added to it and failure states initialized to be used.

Params
  • config as map(xs:string,item()): a map of configuration settings case-insensitive: true if case-insensitive matching should be used Default=false unicode-normalization: "none" or one of the normalization forms per fn:normalize-unicode Default=none
Returns
  • map(xs:string,item())
declare function this:trie(
  $config as map(xs:string,item())
) as map(xs:string,item())
{
  map {
    "config": util:merge-into($this:DEFAULT-CONFIGURATION,$config),
    "root": 1,
    "constructed-failure-states": false(),
    "num-keywords": 0,
    "num-states": 1,
    "states":
      this:state-array(this:empty-state())=>this:array-put(1, this:state(0,1)),
    "size-map": map {}
  }
}

Function: trie
declare function trie($config as map(xs:string,item()), $words as xs:string*, $codepoint-function as function(xs:string,map(xs:string,item())) as xs:integer*) as map(xs:string,item())


trie()
Sharp tools version: if codepoint function isn't the same as used
in matching against the trie, weird things can happen
Construct a complete trie. Once this is called, additional keywords
cannot be added.

Params
  • config as map(xs:string,item()): a map of configuration settings case-insensitive: true if case-insensitive matching should be used Default=false unicode-normalization: "none" or one of the normalization forms per fn:normalize-unicode
  • words as xs:string*: words to put into the trie
  • codepoint-function as function(xs:string,map(xs:string,item()))asxs:integer*: function to use to map strings to codepoints
Returns
  • map(xs:string,item())
declare function this:trie(
  $config as map(xs:string,item()),
  $words as xs:string*,
  $codepoint-function as function(xs:string,map(xs:string,item())) as xs:integer*
) as map(xs:string,item())
{
  this:insert-all(
    this:trie($config),
    $words,
    $codepoint-function
  )=>this:initialize-states()
}

Function: insert
declare function insert($trie as map(xs:string,item()), $word as xs:string) as map(xs:string,item())


insert()
Insertion of a word into the trie. This cannot be performed
on a completed trie that has had its states initialized. Return the
updated trie.

Params
  • trie as map(xs:string,item()): input trie
  • word as xs:string: word to add
Returns
  • map(xs:string,item())
declare function this:insert(
  $trie as map(xs:string,item()),
  $word as xs:string
) as map(xs:string,item())
{
  util:assert(not($trie("constructed-failure-states")), "Cannot add to trie with constructed failure states"),
  if (empty($word)) then $trie
  else this:insert($trie, $word, this:codepoints#2)
}

Function: insert
declare function insert($trie as map(xs:string,item()), $word as xs:string, $codepoint-function as function(xs:string,map(xs:string,item())) as xs:integer*) as map(xs:string,item())


insert()
Insertion of a word into the trie. This cannot be performed
on a completed trie that has had its states initialized. Return the
updated trie.

Sharp tools version: if codepoint function isn't the same as used
in matching against the trie, weird things can happen

Params
  • trie as map(xs:string,item()): input trie
  • word as xs:string: word to add
  • codepoint-function as function(xs:string,map(xs:string,item()))asxs:integer*: function to use to map strings to codepoints
Returns
  • map(xs:string,item())
declare function this:insert(
  $trie as map(xs:string,item()),
  $word as xs:string,
  $codepoint-function as function(xs:string,map(xs:string,item())) as xs:integer*
) as map(xs:string,item())
{
  util:assert(not($trie("constructed-failure-states")), "Cannot add to trie with constructed failure states"),
  if (empty($word)) then $trie else
  let $codepoints := $codepoint-function($word, $trie("config"))
  let $data := 
    fold-left(
      $codepoints,
      ($trie, $trie=>this:get-state($trie("root"))),
      function ($data as map(*)*, $codepoint as xs:integer) as map(*)* { 
        let $trie := $data[1]
        let $cur-state := $data[2]
        return (
          (: add-state returns (trie, (possibly)updated state) :)
          $cur-state=>this:add-state($trie, $codepoint)
        )
      }
    )
  let $trie := $data[1]
  let $cur-state := $data[2]
  let $num-keywords := $trie("num-keywords") + 1
  return (
    $trie=>
      this:put-state($cur-state("id"),
        $cur-state=>this:add-emit($word, $num-keywords)
      )=>
      map:put("size-map",
        $trie("size-map")=>map:put($word, count($codepoints))
      )=>
      map:put("num-keywords", $num-keywords)
  )
}

Function: insert-all
declare function insert-all($trie as map(xs:string,item()), $words as xs:string*) as map(xs:string,item())


insert-all()
Bulk insertion of words into the trie. This cannot be performed
on a completed trie that has had its states initialized. Return the
updated trie.

Params
  • trie as map(xs:string,item()): input trie
  • words as xs:string*: words to add
Returns
  • map(xs:string,item())
declare function this:insert-all(
  $trie as map(xs:string,item()),
  $words as xs:string*
) as map(xs:string,item())
{
  fold-left(
    $words, $trie,
    function ($trie as map(xs:string,item()), $word as xs:string) as map(xs:string,item()) {
      $trie=>this:insert($word)
    }
  )
}

Function: insert-all
declare function insert-all($trie as map(xs:string,item()), $words as xs:string*, $codepoint-function as function(xs:string,map(xs:string,item())) as xs:integer*) as map(xs:string,item())


insert-all()
Bulk insertion of words into the trie. This cannot be performed
on a completed trie that has had its states initialized. Return the
updated trie.

Sharp tools version: if codepoint function isn't the same as used
in matching against the trie, weird things can happen

Params
  • trie as map(xs:string,item()): input trie
  • words as xs:string*: words to add
  • codepoint-function as function(xs:string,map(xs:string,item()))asxs:integer*: function to use to map strings to codepoints
Returns
  • map(xs:string,item())
declare function this:insert-all(
  $trie as map(xs:string,item()),
  $words as xs:string*,
  $codepoint-function as function(xs:string,map(xs:string,item())) as xs:integer*
) as map(xs:string,item())
{
  fold-left(
    $words,
    $trie,
    function ($trie as map(xs:string,item()), $word as xs:string) as map(xs:string,item()) {
      $trie=>this:insert($word, $codepoint-function)
    }
  )
}

Function: initialize-states
declare function initialize-states($trie as map(xs:string,item())) as map(xs:string,item())


initialize-states()
Initialize the failure states in the trie and return the updated trie.
Once the trie has been completed in this way, more words cannot be added
to it.

Params
  • trie as map(xs:string,item()): trie to initialize
Returns
  • map(xs:string,item())
declare function this:initialize-states(
  $trie as map(xs:string,item())
) as map(xs:string,item())
{
  if ($trie("constructed-failure-states")) then $trie
  else this:construct-failure-states($trie)
}

Function: get-matches
declare function get-matches($trie as map(xs:string,item()), $text as xs:string, $options as map(xs:string,item())) as map(xs:string,item())*


get-matches()
Find matches to the words in the trie in the input text. Matching is
performed according to the trie's configuration. Matches are returned
as "emit" objects, in order of the match. Matches with the same starting
point (if overlaps are allowed) are returned shortest first. The accessors
start(), end(), keyword(), and index() can be used to get information about
the matches.
start(): position of start of match in text
end(): position of end of match in text
keyword(): matching keyword string
index(): index of keyword (position in the input set); this can be used
to provide linkage to metadata related to the dictionary

Params
  • trie as map(xs:string,item()): trie of words
  • text as xs:string: input text to match
  • options as map(xs:string,item()): matching options only-whole-words: true if complete words should match a "word" is determined by a word-bounds-regex Default=false allow-overlaps: true if overlapping matches will be returned Default=false word-bounds-regex: regular expression used to detect non-whole-words If the characters (if any) immediately before and after the candidate do not match it, the candidate is rejected. Default=non-letter, non-number [^\p{L}\p{N}] Note: presence of these characters within the 'word' is OK
Returns
  • map(xs:string,item())*
declare function this:get-matches(
  $trie as map(xs:string,item()),
  $text as xs:string,
  $options as map(xs:string,item())
) as map(xs:string,item())*
{
  this:get-matches($trie, $text, this:codepoints#2, $options)
}

Function: get-matches
declare function get-matches($trie as map(xs:string,item()), $text as xs:string) as map(xs:string,item())*


get-matches()
Find matches to the words in the trie in the input text using default
matching options.

Params
  • trie as map(xs:string,item()): trie of words
  • text as xs:string: input text to match
Returns
  • map(xs:string,item())*
declare function this:get-matches(
  $trie as map(xs:string,item()),
  $text as xs:string
) as map(xs:string,item())*
{
  this:get-matches($trie, $text, this:codepoints#2, map {})
}

Function: get-matches
declare function get-matches($trie as map(xs:string,item()), $text as xs:string, $codepoint-function as function(xs:string,map(xs:string,item())) as xs:integer*, $options as map(xs:string,item())) as map(xs:string,item())*


get-matches()
Find matches to the words in the trie in the input text. Matching is
performed according to the trie's configuration. Matches are returned
as "emit" objects, in order of the match. Matches with the same starting
point (if overlaps are allowed) are returned shortest first. The accessors
start(), end(), keyword(), and index() can be used to get information about
the matches.
start(): position of start of match in text
end(): position of end of match in text
keyword(): matching keyword string
index(): index of keyword (position in the input set); this can be used
to provide linkage to metadata related to the dictionary

Sharp tools version of get-matches

Params
  • trie as map(xs:string,item()): trie of words
  • text as xs:string: input text to match
  • codepoint-function as function(xs:string,map(xs:string,item()))asxs:integer*: function to use to map strings to codepoints: if this does not match the codepoint function used to construct the trie, weird things can happen
  • options as map(xs:string,item()): matching options only-whole-words: true if complete words should match a "word" is determined by a word-bounds-regex Default=false allow-overlaps: true if overlapping matches will be returned Default=false word-bounds-regex: regular expression used to detect non-whole-words If the characters (if any) immediately before and after the candidate do not match it, the candidate is rejected. Default=non-letter, non-number [^\p{L}\p{N}] Note: presence of these characters within the 'word' is OK
Returns
  • map(xs:string,item())*
declare function this:get-matches(
  $trie as map(xs:string,item()),
  $text as xs:string,
  $codepoint-function as function(xs:string,map(xs:string,item())) as xs:integer*,
  $options as map(xs:string,item())
) as map(xs:string,item())*
{
  let $options := util:merge-into($this:DEFAULT-OPTIONS, $options)
  let $codepoints := $codepoint-function($text, $trie("config"))
  let $collected-emits := 
    fold-left(
      1 to count($codepoints),
      $trie=>this:get-state($trie("root")),
      function ($emits-and-state as map(*)*, $pos as xs:integer) as map(*)* {
        let $cur-state := $emits-and-state[last()]
        let $emits := $emits-and-state[position() < last()]
        let $new-state := $cur-state=>this:get-state($trie, $codepoints[$pos])
        return (
          $emits,
          this:get-emits($trie, $new-state, $pos),
          if (empty($new-state)) then this:empty-state() else $new-state
        )
      }
    )[position() < last()]
  let $incompletes-purged :=
    if ($options("only-whole-words")) then (
      let $regex := $options("word-bounds-regex")
      return this:remove-partial-matches($text, $collected-emits, $regex)
    ) else $collected-emits
  let $overlaps-purged :=
    if ($options("allow-overlaps"))
    then $incompletes-purged
    else (
      let $tree := this:node($incompletes-purged)
      return this:remove-overlaps($tree, $incompletes-purged)
    )
  for $emit in $overlaps-purged
  order by this:start($emit) ascending, this:end($emit) ascending
  return $emit
}

Function: start
declare function start($emit as map(xs:string,item())) as xs:integer


start()
Accessor for starting string offset of emitted match in the text.

Params
  • emit as map(xs:string,item()): match returned from get-matches()
Returns
  • xs:integer
declare function this:start($emit as map(xs:string,item())) as xs:integer
{
  $emit("start")
}

Function: end
declare function end($emit as map(xs:string,item())) as xs:integer


end()
Accessor for ending string offset of emitted match in the text.

Params
  • emit as map(xs:string,item()): match returned from get-matches()
Returns
  • xs:integer
declare function this:end($emit as map(xs:string,item())) as xs:integer
{
  $emit("end")
}

Function: keyword
declare function keyword($emit as map(xs:string,item())) as xs:string


keyword()
Accessor for the matching word in the emitted match.

Params
  • emit as map(xs:string,item()): match returned from get-matches()
Returns
  • xs:string
declare function this:keyword($emit as map(xs:string,item())) as xs:string
{
  $emit("keyword")
}

Function: index
declare function index($emit as map(xs:string,item())) as xs:integer


index()
Accessor for keyword index of emitted match. This is the ordinal number
of the added word. It can be used to correlate the match with other
metadata.

Params
  • emit as map(xs:string,item()): match returned from get-matches()
Returns
  • xs:integer
declare function this:index($emit as map(xs:string,item())) as xs:integer
{
  $emit("index")
}

Function: interval-size
declare function interval-size($emit as map(xs:string,item())) as xs:integer


interval-size()
Convenience function that computes the length of the match.

Params
  • emit as map(xs:string,item()): match returned from get-matches()
Returns
  • xs:integer
declare function this:interval-size($emit as map(xs:string,item())) as xs:integer
{
  this:end($emit) - this:start($emit) + 1
}

Function: keyword
declare function keyword($trie as map(xs:string,item()), $index as xs:integer) as xs:string


keyword()
Accessor for keywords in the trie.
Not very efficient. Alternative: keep a full list of the keywords.
This is only really useful for replace() with a $N pattern, which is
a low probability use case, honestly.

Params
  • trie as map(xs:string,item()): the trie
  • index as xs:integer: keyword index
Returns
  • xs:string
declare function this:keyword(
  $trie as map(xs:string,item()),
  $index as xs:integer
) as xs:string
{
  let $num-states := $trie=>map:get("num-states")
  return
    tail(
      util:while(
        function ($state as xs:integer, $keyword as xs:string) as xs:boolean { empty($keyword) and $state < $num-states },
        function ($state as xs:integer, $keyword as xs:string) as item()* {
          $state + 1,
          let $emits := $trie=>this:get-state($state)=>map:get("emits")
          for $keyword in $emits=>map:keys()
          where $emits=>map:get($keyword)=$index
          return $keyword
        },
        1, ()
      )
    )
}

Function: codepoints
declare function codepoints($string as xs:string, $config as map(xs:string, item())) as xs:integer*


codepoints()
Default codepoint function. Does a string-to-codepoints, possibly with
mapping to lower case followed by Unicode normalization, depending on
configuration parameters.

Params
  • string as xs:string: input string
  • config as map(xs:string,item()): configuration parameters "case-insensitive": boolean, whether to map to lowercase (default=false()) "unicode-normalization": string, one of "none" or one of the Unicode normalization forms accepted by normalize-unicode(), whether to perform Unicode normalization on the string
Returns
  • xs:integer*
declare function this:codepoints(
  $string as xs:string,
  $config as map(xs:string, item())
) as xs:integer*
{
  string-to-codepoints(
    let $case-insensitive := ($config("case-insensitive"),false())[1]
    let $normalization := ($config("unicode-normalization"),"none")[1]
    let $case-string :=
      if ($case-insensitive) then lower-case($string) else $string
    return
      if ($normalization="none")
      then $case-string
      else normalize-unicode($case-string, $normalization)
  )
}

Function: extract
declare function extract($input as node()*, $trie as map(xs:string,item()), $extract-function as function((:text:)xs:string, (:match:)map(xs:string,item())) as node()*) as node()*


extract()
Extract all the matches from the text nodes in the input using the
default match options, and return them as handled by the extract function.

Params
  • input as node()*: nodes to process
  • trie as map(xs:string,item()): the trie
  • extract-function as function(xs:string,map(xs:string,item()))asnode()*: a function to process each match The function is given the matching text from the input and the match itself
Returns
  • node()*
declare function this:extract(
  $input as node()*,
  $trie as map(xs:string,item()),
  $extract-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item())) as node()*
) as node()*
{
  this:extract($input, $trie, $extract-function, map {})
}

Function: extract
declare function extract($input as node()*, $trie as map(xs:string,item()), $extract-function as function((:text:)xs:string, (:match:)map(xs:string,item())) as node()*, $match-options as map(xs:string,item()*)?) as node()*


extract()
Extract all the matches from the text nodes in the input using the
given match options, and return them as handled by the extract function.

Params
  • input as node()*: nodes to process
  • trie as map(xs:string,item()): the trie
  • extract-function as function(xs:string,map(xs:string,item()))asnode()*: a function to process each match The function is given the matching text from the input and the match itself
  • match-options as map(xs:string,item()*)?: options to use for matching keywords
Returns
  • node()*
declare function this:extract(
  $input as node()*,
  $trie as map(xs:string,item()),
  $extract-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item())) as node()*,
  $match-options as map(xs:string,item()*)?
) as node()*
{
  let $options := util:merge-into($this:DEFAULT-OPTIONS, $match-options)
  for $text in $input/text()
  for $match in $trie=>this:get-matches($text, $options)
  return (
    $extract-function(
      substring($text, this:start($match), this:interval-size($match)),
      $match
    )
  )
}

Function: markup
declare function markup($input as node()*, $trie as map(xs:string,item()), $markup-function as function((:text:)xs:string, (:match:)map(xs:string,item())) as node()*) as node()*


markup()
Walk the input nodes and return them with all the matches replaced
with the output of the extract function over the given match. Use the
default match options.

Params
  • input as node()*: nodes to process
  • trie as map(xs:string,item()): the entity trie
  • markup-function as function(xs:string,map(xs:string,item()))asnode()*: a function to process each match The function is given the matching text from the input and the match itself
Returns
  • node()*
declare function this:markup(
  $input as node()*,
  $trie as map(xs:string,item()),
  $markup-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item())) as node()*
) as node()*
{
  this:markup($input, $trie, $markup-function, map {})
}

Function: markup
declare function markup($input as node()*, $trie as map(xs:string,item()), $markup-function as function((:text:)xs:string, (:match:)map(xs:string,item())) as node()*, $match-options as map(xs:string,item()*)?) as node()*


markup()
Walk the input nodes and return them with all the matches replaced
with the output of the extract function over the given match. Use the
given match options.

Params
  • input as node()*: nodes to process
  • trie as map(xs:string,item()): the entity trie
  • markup-function as function(xs:string,map(xs:string,item()))asnode()*: a function to process each match The function is given the matching text from the input and the match itself
  • match-options as map(xs:string,item()*)?: options to use for matching keywords
Returns
  • node()*
declare function this:markup(
  $input as node()*,
  $trie as map(xs:string,item()),
  $markup-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item())) as node()*,
  $match-options as map(xs:string,item()*)?
) as node()*
{
  let $options := util:merge-into($this:DEFAULT-OPTIONS, $match-options)
  for $node in $input return typeswitch($node)
  case document-node() return
    document {
      this:markup($node/node(), $trie, $markup-function, $match-options)
    }
  case element() return
    element {node-name($node)} {
      this:markup($node/node(), $trie, $markup-function, $match-options)
    }
  case text() return
    let $text := string($node)
    let $matches := $trie=>this:get-matches($text, $options)
    let $markup :=
      fold-left(
        $matches, (1),
        function ($markup as item()*, $match as map(xs:string,item())*) as item()* {
          let $pos := head($markup)
          let $next := this:start($match)
          return (
            this:end($match)+1, tail($markup), (
              if ($next > $pos) then (
                text { substring($text, $pos, $next - $pos) }
              ) else (),
              $markup-function(
                substring($text, this:start($match), this:interval-size($match)),
                $match
              )
            )
          )
        }
      )
    return (
      tail($markup),
      text { substring($text, head($markup)) }
    )
  default return $node
}

Function: normalize
declare function normalize($matching as xs:string, $match as map(xs:string,item())) as node()*


normalize()
Replace the matching text with the keyword
If we did case folding or other codepoint manipulations this can make
a difference.

Params
  • matching as xs:string
  • match as map(xs:string,item())
Returns
  • node()*
declare %art:markup function this:normalize(
  $matching as xs:string,
  $match as map(xs:string,item())
) as node()*
{
  text {this:keyword($match)}
}

Function: remove
declare function remove($matching as xs:string, $match as map(xs:string,item())) as node()*


remove()
Remove the matching text.

Params
  • matching as xs:string
  • match as map(xs:string,item())
Returns
  • node()*
declare %art:markup function this:remove(
  $matching as xs:string,
  $match as map(xs:string,item())
) as node()*
{
  ()
}

Function: full-element
declare function full-element($matching as xs:string, $match as map(xs:string,item())) as node()*


full-entity()
Wrap the matching text in markup. The element will include attributes
with the keyword id, the start position of the match (useful for extract()),
and the matching keyword (useful if we did case folding or other codepoint
mapping).

Params
  • matching as xs:string
  • match as map(xs:string,item())
Returns
  • node()*
declare %art:markup function this:full-element(
  $matching as xs:string,
  $match as map(xs:string,item())
) as node()*
{
  element {QName("http://mathling.com/entity", "e:keyword")} {
    attribute id {this:index($match)},
    attribute keyword {this:keyword($match)},
    attribute start {this:start($match)},
    $matching
  }
}

Function: basic-element
declare function basic-element($matching as xs:string, $match as map(xs:string,item())) as node()*


basic-entity()
Wrap the matching text in markup. The element will include an attribute
with the keyword id.

Params
  • matching as xs:string
  • match as map(xs:string,item())
Returns
  • node()*
declare %art:markup function this:basic-element(
  $matching as xs:string,
  $match as map(xs:string,item())
) as node()*
{
  element {QName("http://mathling.com/entity", "e:keyword")} {
    attribute id {this:index($match)},
    $matching
  }
}

Function: load-compiled
declare function load-compiled($file as xs:string) as map(xs:string,item())


load-compiled()
Load a compiled trie. Return a completed trie, with the states computed.

Params
  • file as xs:string: file containing the saved compiled trie
Returns
  • map(xs:string,item())
declare function this:load-compiled($file as xs:string) as map(xs:string,item())
{
  this:parse(unparsed-text($file))
}

Function: save-compiled
declare function save-compiled($to as xs:string, $trie as map(xs:string,item())) as empty-sequence()


save-compiled()
Save a compiled trie to a file. Requires Saxon-PE or Saxon-EE. If
you have Saxon-HE set query output method to text and use serialize()
directly.

Params
  • to as xs:string: file to write to
  • trie as map(xs:string,item()): completed trie to save
Returns
declare function this:save-compiled(
  $to as xs:string, 
  $trie as map(xs:string,item())
) as empty-sequence()
{
  $this:SAVE-IMPL($to, serialize($trie, map {"method": "json"}))
}

Function: parse
declare function parse($compiled as xs:string) as map(xs:string,item())


parse()
Parse a compiled trie. Return a completed trie, with the states computed.

Params
  • compiled as xs:string: string containing the compiled format
Returns
  • map(xs:string,item())
declare function this:parse($compiled as xs:string) as map(xs:string,item())
{
  (: Fix up raw JSON because all integers got turned into doubles, alas :)
  this:fix-trie(parse-json($compiled))
}

Function: fix-trie
declare function fix-trie($raw as map(xs:string, item())) as map(xs:string,item())


fix-trie()
Fix a trie read directly from JSON. Useful when wrapping compiled trie
with more data.

Params
  • raw as map(xs:string,item()): the raw map parsed from JSON
Returns
  • map(xs:string,item())
declare function this:fix-trie(
  $raw as map(xs:string, item())
) as map(xs:string,item())
{
  $raw=>
    map:put("root", 1)=>
    map:put("num-states", xs:integer($raw("num-states")))=>
    map:put("num-keywords", xs:integer($raw("num-keywords")))=>
    map:put("states", this:fix-states($raw("states")))
}

Function: compile
declare function compile($trie as map(xs:string,item())) as xs:string


compile()
Serialize the trie in a format suitable for parse() to unpack.

Params
  • trie as map(xs:string,item()): the completed trie
Returns
  • xs:string
declare function this:compile($trie as map(xs:string,item())) as xs:string
{
  serialize($trie, map {"method": "json"})
}

Original Source Code

xquery version "3.1";
(:~
 : Aho-Corasick implementation
 :
 : Usage:
 : 1. Construct the trie
 : 2. Populate the trie with words
 : 3. Initialize failure states
 : 4. Repeatedly use the trie to find matches in text
 : 
 : The trie must have failure states initialized to be used, and cannot have
 : additional words added once it has been so initialized. You can use the
 : full constructor to do steps 1 through 3 in one go.
 : 
 : A trie cannot be have its configuration altered once it has been 
 : populated.
 :
 : Copyright© Mary Holstege 2020-2023
 : CC-BY (https://creativecommons.org/licenses/by/4.0/)
 : @since February 2021
 :)
module namespace this="http://mathling.com/string/aho";

declare namespace art="http://mathling.com/art";
declare namespace map="http://www.w3.org/2005/xpath-functions/map";
declare namespace array="http://www.w3.org/2005/xpath-functions/array";
declare namespace math="http://www.w3.org/2005/xpath-functions/math";

import module namespace errors="http://mathling.com/core/errors"
       at "../core/errors.xqy";
import module namespace util="http://mathling.com/core/utilities"
       at "../core/utilities.xqy";

(:~
 : Default matching options
 :
 : only-whole-words: true if complete words should match
 :   a "word" is determined by a word-bounds-regex
 : allow-overlaps: true if overlapping matches will be returned
 : word-bounds-regex: regular expression used to detect non-whole-words
 :   If the characters (if any) immediately before and after the
 :   candidate do not match it, the candidate is rejected.
 :   Note: presence of these characters within the 'word' is OK
 :)
declare variable $this:DEFAULT-OPTIONS as map(xs:string,item()) :=
  map {
    "only-whole-words": false(),
    "allow-overlaps": false(),
    "word-bounds-regex": "[^\p{L}\p{N}]"
  }
;

(:~
 : Default configuration options
 :
 : case-insensitive: true if case-insensitive matching should be used
 : unicode-normalization: "none" or one of the normalization forms per
 :   fn:normalize-unicode
 :)
declare variable $this:DEFAULT-CONFIGURATION as map(xs:string,item()) :=
  map {
    "case-insensitive": false(),
    "unicode-normalization": "none"
  }
;

(:======================================================================:
 : Public API
 :======================================================================:)

(:======================================================================:
 : Trie construction
 :======================================================================:)

(:~ 
 : trie()
 : Construct a complete trie. Once this is called, additional keywords
 : cannot be added.
 :
 : @param $config: a map of configuration settings
 : @param $words: words to put into the trie
 :
 :)
declare function this:trie(
  $config as map(xs:string,item()),
  $words as xs:string*
) as map(xs:string,item())
{
  this:insert-all(
    this:trie($config),
    $words
  )=>this:initialize-states()
};

(:~ 
 : trie()
 : Construct a trie using the default configuration. The trie will need
 : to have words added to it and failure states initialized to be used.
 :)
declare function this:trie() as map(xs:string, item())
{
  map {
    "config": $this:DEFAULT-CONFIGURATION, (: configuration settings :)
    "root": 1,                             (: index of root state :)
    "constructed-failure-states": false(), (: states initialized? :)
    "num-keywords": 0,                     (: number of entries :)
    "num-states": 1,                       (: number of states :)
    "states":                              (: state array :)
      this:state-array(this:empty-state())=>this:array-put(1, this:state(0,1)),
    "size-map": map {}                     (: cache of keyword lengths :)
  }
};

(:~ 
 : trie()
 : Construct a trie with a specific configuration. The trie will need
 : to have words added to it and failure states initialized to be used.
 :
 : @param $config: a map of configuration settings
 :   case-insensitive: true if case-insensitive matching should be used
 :     Default=false
 :   unicode-normalization: "none" or one of the normalization forms per
 :     fn:normalize-unicode
 :     Default=none
 :)
declare function this:trie(
  $config as map(xs:string,item())
) as map(xs:string,item())
{
  map {
    "config": util:merge-into($this:DEFAULT-CONFIGURATION,$config),
    "root": 1,
    "constructed-failure-states": false(),
    "num-keywords": 0,
    "num-states": 1,
    "states":
      this:state-array(this:empty-state())=>this:array-put(1, this:state(0,1)),
    "size-map": map {}
  }
};

(:~ 
 : trie()
 : Sharp tools version: if codepoint function isn't the same as used 
 : in matching against the trie, weird things can happen
 : Construct a complete trie. Once this is called, additional keywords
 : cannot be added.
 :
 : @param $config: a map of configuration settings
 :   case-insensitive: true if case-insensitive matching should be used
 :     Default=false
 : unicode-normalization: "none" or one of the normalization forms per
 :   fn:normalize-unicode
 : @param $words: words to put into the trie
 : @param $codepoint-function: function to use to map strings to codepoints
 :)
declare function this:trie(
  $config as map(xs:string,item()),
  $words as xs:string*,
  $codepoint-function as function(xs:string,map(xs:string,item())) as xs:integer*
) as map(xs:string,item())
{
  this:insert-all(
    this:trie($config),
    $words,
    $codepoint-function
  )=>this:initialize-states()
};

(:~
 : insert()
 : Insertion of a word into the trie. This cannot be performed
 : on a completed trie that has had its states initialized. Return the
 : updated trie.
 :
 : @param $trie: input trie
 : @param $word: word to add
 :)
declare function this:insert(
  $trie as map(xs:string,item()),
  $word as xs:string
) as map(xs:string,item())
{
  util:assert(not($trie("constructed-failure-states")), "Cannot add to trie with constructed failure states"),
  if (empty($word)) then $trie
  else this:insert($trie, $word, this:codepoints#2)
};

(:~
 : insert()
 : Insertion of a word into the trie. This cannot be performed
 : on a completed trie that has had its states initialized. Return the
 : updated trie.
 :
 : Sharp tools version: if codepoint function isn't the same as used 
 : in matching against the trie, weird things can happen
 :
 : @param $trie: input trie
 : @param $word: word to add
 : @param $codepoint-function: function to use to map strings to codepoints
 :)
declare function this:insert(
  $trie as map(xs:string,item()),
  $word as xs:string,
  $codepoint-function as function(xs:string,map(xs:string,item())) as xs:integer*
) as map(xs:string,item())
{
  util:assert(not($trie("constructed-failure-states")), "Cannot add to trie with constructed failure states"),
  if (empty($word)) then $trie else
  let $codepoints := $codepoint-function($word, $trie("config"))
  let $data := 
    fold-left(
      $codepoints,
      ($trie, $trie=>this:get-state($trie("root"))),
      function ($data as map(*)*, $codepoint as xs:integer) as map(*)* { 
        let $trie := $data[1]
        let $cur-state := $data[2]
        return (
          (: add-state returns (trie, (possibly)updated state) :)
          $cur-state=>this:add-state($trie, $codepoint)
        )
      }
    )
  let $trie := $data[1]
  let $cur-state := $data[2]
  let $num-keywords := $trie("num-keywords") + 1
  return (
    $trie=>
      this:put-state($cur-state("id"),
        $cur-state=>this:add-emit($word, $num-keywords)
      )=>
      map:put("size-map",
        $trie("size-map")=>map:put($word, count($codepoints))
      )=>
      map:put("num-keywords", $num-keywords)
  )
};

(:~
 : insert-all()
 : Bulk insertion of words into the trie. This cannot be performed
 : on a completed trie that has had its states initialized. Return the
 : updated trie.
 :
 : @param $trie: input trie
 : @param $words: words to add
 :)
declare function this:insert-all(
  $trie as map(xs:string,item()),
  $words as xs:string*
) as map(xs:string,item())
{
  fold-left(
    $words, $trie,
    function ($trie as map(xs:string,item()), $word as xs:string) as map(xs:string,item()) {
      $trie=>this:insert($word)
    }
  )
};

(:~
 : insert-all()
 : Bulk insertion of words into the trie. This cannot be performed
 : on a completed trie that has had its states initialized. Return the
 : updated trie.
 :
 : Sharp tools version: if codepoint function isn't the same as used 
 : in matching against the trie, weird things can happen
 :
 : @param $trie: input trie
 : @param $words: words to add
 : @param $codepoint-function: function to use to map strings to codepoints
 :)
declare function this:insert-all(
  $trie as map(xs:string,item()),
  $words as xs:string*,
  $codepoint-function as function(xs:string,map(xs:string,item())) as xs:integer*
) as map(xs:string,item())
{
  fold-left(
    $words,
    $trie,
    function ($trie as map(xs:string,item()), $word as xs:string) as map(xs:string,item()) {
      $trie=>this:insert($word, $codepoint-function)
    }
  )
};

(:~
 : initialize-states()
 : Initialize the failure states in the trie and return the updated trie.
 : Once the trie has been completed in this way, more words cannot be added
 : to it.
 :
 : @param $trie: trie to initialize
 :)
declare function this:initialize-states(
  $trie as map(xs:string,item())
) as map(xs:string,item())
{
  if ($trie("constructed-failure-states")) then $trie
  else this:construct-failure-states($trie)
};

(:======================================================================:
 : Matching
 :======================================================================:)

(:~
 : get-matches()
 : Find matches to the words in the trie in the input text. Matching is
 : performed according to the trie's configuration. Matches are returned
 : as "emit" objects, in order of the match. Matches with the same starting
 : point (if overlaps are allowed) are returned shortest first. The accessors
 : start(), end(), keyword(), and index() can be used to get information about
 : the matches.
 :   start(): position of start of match in text
 :   end(): position of end of match in text
 :   keyword(): matching keyword string
 :   index(): index of keyword (position in the input set); this can be used
 :     to provide linkage to metadata related to the dictionary
 :
 : @param $trie: trie of words
 : @param $text: input text to match
 : @param $options: matching options
 :   only-whole-words: true if complete words should match
 :     a "word" is determined by a word-bounds-regex
 :     Default=false
 :   allow-overlaps: true if overlapping matches will be returned
 :     Default=false
 :   word-bounds-regex: regular expression used to detect non-whole-words
 :     If the characters (if any) immediately before and after the
 :     candidate do not match it, the candidate is rejected.
 :     Default=non-letter, non-number [^\p{L}\p{N}]
 :     Note: presence of these characters within the 'word' is OK
 : 
 :)
declare function this:get-matches(
  $trie as map(xs:string,item()),
  $text as xs:string,
  $options as map(xs:string,item())
) as map(xs:string,item())*
{
  this:get-matches($trie, $text, this:codepoints#2, $options)
};

(:~
 : get-matches()
 : Find matches to the words in the trie in the input text using default
 : matching options.
 :
 : @param $trie: trie of words
 : @param $text: input text to match
 :)
declare function this:get-matches(
  $trie as map(xs:string,item()),
  $text as xs:string
) as map(xs:string,item())*
{
  this:get-matches($trie, $text, this:codepoints#2, map {})
};

(:~
 : get-matches()
 : Find matches to the words in the trie in the input text. Matching is
 : performed according to the trie's configuration. Matches are returned
 : as "emit" objects, in order of the match. Matches with the same starting
 : point (if overlaps are allowed) are returned shortest first. The accessors
 : start(), end(), keyword(), and index() can be used to get information about
 : the matches.
 :   start(): position of start of match in text
 :   end(): position of end of match in text
 :   keyword(): matching keyword string
 :   index(): index of keyword (position in the input set); this can be used
 :     to provide linkage to metadata related to the dictionary
 : 
 : Sharp tools version of get-matches
 :
 : @param $trie: trie of words
 : @param $text: input text to match
 : @param $codepoint-function: function to use to map strings to codepoints:
 :   if this does not match the codepoint function used to construct the trie,
 :   weird things can happen
 : @param $options: matching options
 :   only-whole-words: true if complete words should match
 :     a "word" is determined by a word-bounds-regex
 :     Default=false
 :   allow-overlaps: true if overlapping matches will be returned
 :     Default=false
 :   word-bounds-regex: regular expression used to detect non-whole-words
 :     If the characters (if any) immediately before and after the
 :     candidate do not match it, the candidate is rejected.
 :     Default=non-letter, non-number [^\p{L}\p{N}]
 :     Note: presence of these characters within the 'word' is OK
 :)
declare function this:get-matches(
  $trie as map(xs:string,item()),
  $text as xs:string,
  $codepoint-function as function(xs:string,map(xs:string,item())) as xs:integer*,
  $options as map(xs:string,item())
) as map(xs:string,item())*
{
  let $options := util:merge-into($this:DEFAULT-OPTIONS, $options)
  let $codepoints := $codepoint-function($text, $trie("config"))
  let $collected-emits := 
    fold-left(
      1 to count($codepoints),
      $trie=>this:get-state($trie("root")),
      function ($emits-and-state as map(*)*, $pos as xs:integer) as map(*)* {
        let $cur-state := $emits-and-state[last()]
        let $emits := $emits-and-state[position() < last()]
        let $new-state := $cur-state=>this:get-state($trie, $codepoints[$pos])
        return (
          $emits,
          this:get-emits($trie, $new-state, $pos),
          if (empty($new-state)) then this:empty-state() else $new-state
        )
      }
    )[position() < last()]
  let $incompletes-purged :=
    if ($options("only-whole-words")) then (
      let $regex := $options("word-bounds-regex")
      return this:remove-partial-matches($text, $collected-emits, $regex)
    ) else $collected-emits
  let $overlaps-purged :=
    if ($options("allow-overlaps"))
    then $incompletes-purged
    else (
      let $tree := this:node($incompletes-purged)
      return this:remove-overlaps($tree, $incompletes-purged)
    )
  for $emit in $overlaps-purged
  order by this:start($emit) ascending, this:end($emit) ascending
  return $emit
};

(:======================================================================:
 : Accessors
 :======================================================================:)

(:~ 
 : start()
 : Accessor for starting string offset of emitted match in the text.
 : 
 : @param $emit: match returned from get-matches()
 :)
declare function this:start($emit as map(xs:string,item())) as xs:integer
{
  $emit("start")
};

(:~ 
 : end()
 : Accessor for ending string offset of emitted match in the text.
 : 
 : @param $emit: match returned from get-matches()
 :)
declare function this:end($emit as map(xs:string,item())) as xs:integer
{
  $emit("end")
};

(:~ 
 : keyword()
 : Accessor for the matching word in the emitted match. 
 : 
 : @param $emit: match returned from get-matches()
 :)
declare function this:keyword($emit as map(xs:string,item())) as xs:string
{
  $emit("keyword")
};

(:~ 
 : index()
 : Accessor for keyword index of emitted match. This is the ordinal number
 : of the added word. It can be used to correlate the match with other 
 : metadata.
 : 
 : @param $emit: match returned from get-matches()
 :)
declare function this:index($emit as map(xs:string,item())) as xs:integer
{
  $emit("index")
};

(:~
 : interval-size()
 : Convenience function that computes the length of the match.
 :
 : @param $emit: match returned from get-matches()
 :)
declare function this:interval-size($emit as map(xs:string,item())) as xs:integer
{
  this:end($emit) - this:start($emit) + 1
};

(:~
 : keyword()
 : Accessor for keywords in the trie.
 : Not very efficient. Alternative: keep a full list of the keywords.
 : This is only really useful for replace() with a $N pattern, which is 
 : a low probability use case, honestly.
 :
 : @param $trie: the trie
 : @param $index: keyword index
 :)
declare function this:keyword(
  $trie as map(xs:string,item()),
  $index as xs:integer
) as xs:string
{
  let $num-states := $trie=>map:get("num-states")
  return
    tail(
      util:while(
        function ($state as xs:integer, $keyword as xs:string) as xs:boolean { empty($keyword) and $state < $num-states },
        function ($state as xs:integer, $keyword as xs:string) as item()* {
          $state + 1,
          let $emits := $trie=>this:get-state($state)=>map:get("emits")
          for $keyword in $emits=>map:keys()
          where $emits=>map:get($keyword)=$index
          return $keyword
        },
        1, ()
      )
    )
};

(:======================================================================:
 : Codepoint functions
 :======================================================================:)

(:~
 : codepoints()
 : Default codepoint function. Does a string-to-codepoints, possibly with
 : mapping to lower case followed by Unicode normalization, depending on 
 : configuration parameters.
 :
 : @param $string: input string
 : @param $config: configuration parameters
 :   "case-insensitive": boolean, whether to map to lowercase (default=false())
 :   "unicode-normalization": string, one of "none" or one of the Unicode
 :      normalization forms accepted by normalize-unicode(), whether to perform
 :      Unicode normalization on the string
 :)
declare function this:codepoints(
  $string as xs:string,
  $config as map(xs:string, item())
) as xs:integer*
{
  string-to-codepoints(
    let $case-insensitive := ($config("case-insensitive"),false())[1]
    let $normalization := ($config("unicode-normalization"),"none")[1]
    let $case-string :=
      if ($case-insensitive) then lower-case($string) else $string
    return
      if ($normalization="none")
      then $case-string
      else normalize-unicode($case-string, $normalization)
  )
};

(:======================================================================:
 : Match and process
 :======================================================================:)

(:~
 : extract()
 : Extract all the matches from the text nodes in the input using the
 : default match options, and return them as handled by the extract function.
 :
 : @param $input: nodes to process
 : @param $trie: the trie
 : @param $extract-function: a function to process each match
 :   The function is given the matching text from the input and the match itself
 :)
declare function this:extract(
  $input as node()*,
  $trie as map(xs:string,item()),
  $extract-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item())) as node()*
) as node()*
{
  this:extract($input, $trie, $extract-function, map {})
};

(:~
 : extract()
 : Extract all the matches from the text nodes in the input using the
 : given match options, and return them as handled by the extract function.
 :
 : @param $input: nodes to process
 : @param $trie: the trie
 : @param $extract-function: a function to process each match
 :   The function is given the matching text from the input and the match itself
 : @param $match-options: options to use for matching keywords 
:)
declare function this:extract(
  $input as node()*,
  $trie as map(xs:string,item()),
  $extract-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item())) as node()*,
  $match-options as map(xs:string,item()*)?
) as node()*
{
  let $options := util:merge-into($this:DEFAULT-OPTIONS, $match-options)
  for $text in $input/text()
  for $match in $trie=>this:get-matches($text, $options)
  return (
    $extract-function(
      substring($text, this:start($match), this:interval-size($match)),
      $match
    )
  )
};

(:~
 : markup()
 : Walk the input nodes and return them with all the matches replaced
 : with the output of the extract function over the given match. Use the
 : default match options.
 :
 : @param $input: nodes to process
 : @param $trie: the entity trie
 : @param $markup-function: a function to process each match
 :   The function is given the matching text from the input and the match itself
 :)
declare function this:markup(
  $input as node()*,
  $trie as map(xs:string,item()),
  $markup-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item())) as node()*
) as node()*
{
  this:markup($input, $trie, $markup-function, map {})
};

(:~
 : markup()
 : Walk the input nodes and return them with all the matches replaced
 : with the output of the extract function over the given match. Use the
 : given match options.
 :
 : @param $input: nodes to process
 : @param $trie: the entity trie
 : @param $markup-function: a function to process each match
 :   The function is given the matching text from the input and the match itself
 : @param $match-options: options to use for matching keywords
 :)
declare function this:markup(
  $input as node()*,
  $trie as map(xs:string,item()),
  $markup-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item())) as node()*,
  $match-options as map(xs:string,item()*)?
) as node()*
{
  let $options := util:merge-into($this:DEFAULT-OPTIONS, $match-options)
  for $node in $input return typeswitch($node)
  case document-node() return
    document {
      this:markup($node/node(), $trie, $markup-function, $match-options)
    }
  case element() return
    element {node-name($node)} {
      this:markup($node/node(), $trie, $markup-function, $match-options)
    }
  case text() return
    let $text := string($node)
    let $matches := $trie=>this:get-matches($text, $options)
    let $markup :=
      fold-left(
        $matches, (1),
        function ($markup as item()*, $match as map(xs:string,item())*) as item()* {
          let $pos := head($markup)
          let $next := this:start($match)
          return (
            this:end($match)+1, tail($markup), (
              if ($next > $pos) then (
                text { substring($text, $pos, $next - $pos) }
              ) else (),
              $markup-function(
                substring($text, this:start($match), this:interval-size($match)),
                $match
              )
            )
          )
        }
      )
    return (
      tail($markup),
      text { substring($text, head($markup)) }
    )
  default return $node
};

(:======================================================================:
 : Some markup functions
 :======================================================================:)

(:~
 : normalize()
 : Replace the matching text with the keyword
 : If we did case folding or other codepoint manipulations this can make
 : a difference.
 :)
declare %art:markup function this:normalize(
  $matching as xs:string,
  $match as map(xs:string,item())
) as node()*
{
  text {this:keyword($match)}
};

(:~
 : remove()
 : Remove the matching text.
 :)
declare %art:markup function this:remove(
  $matching as xs:string,
  $match as map(xs:string,item())
) as node()*
{
  ()
};

(:~
 : full-entity()
 : Wrap the matching text in markup. The element will include attributes
 : with the keyword id, the start position of the match (useful for extract()),
 : and the matching keyword (useful if we did case folding or other codepoint
 : mapping). 
 :)
declare %art:markup function this:full-element(
  $matching as xs:string,
  $match as map(xs:string,item())
) as node()*
{
  element {QName("http://mathling.com/entity", "e:keyword")} {
    attribute id {this:index($match)},
    attribute keyword {this:keyword($match)},
    attribute start {this:start($match)},
    $matching
  }
};

(:~
 : basic-entity()
 : Wrap the matching text in markup. The element will include an attribute
 : with the keyword id.
 :)
declare %art:markup function this:basic-element(
  $matching as xs:string,
  $match as map(xs:string,item())
) as node()*
{
  element {QName("http://mathling.com/entity", "e:keyword")} {
    attribute id {this:index($match)},
    $matching
  }
};

(:======================================================================
 : Load and rebuild saved trie
 :======================================================================:)

(:~
 : load-compiled()
 : Load a compiled trie. Return a completed trie, with the states computed.
 : 
 : @param $file: file containing the saved compiled trie
 :)
declare function this:load-compiled($file as xs:string) as map(xs:string,item())
{
  this:parse(unparsed-text($file))
};

declare %private variable $this:SAVE-IMPL as function(xs:string,xs:string) as item()* :=
  let $write-text := function-lookup(QName("http://expath.org/ns/file","write-text"), 2)
  return (
    if (empty($write-text)) then (
      function ($file as xs:string, $contents as xs:string) as empty-sequence() {
        errors:error("ML-UNAVAILABLE", "file:write-text")
      }
    ) else $write-text
  )
;

(:~ 
 : save-compiled()
 : Save a compiled trie to a file. Requires Saxon-PE or Saxon-EE. If
 : you have Saxon-HE set query output method to text and use serialize()
 : directly.
 : 
 : @param $to: file to write to
 : @param $trie: completed trie to save
 :)
declare function this:save-compiled(
  $to as xs:string, 
  $trie as map(xs:string,item())
) as empty-sequence()
{
  $this:SAVE-IMPL($to, serialize($trie, map {"method": "json"}))
};

(:~
 : parse()
 : Parse a compiled trie. Return a completed trie, with the states computed.
 : 
 : @param $compiled: string containing the compiled format
 :)
declare function this:parse($compiled as xs:string) as map(xs:string,item())
{
  (: Fix up raw JSON because all integers got turned into doubles, alas :)
  this:fix-trie(parse-json($compiled))
};

(:~
 : fix-trie()
 : Fix a trie read directly from JSON. Useful when wrapping compiled trie
 : with more data.
 :
 : @param $raw: the raw map parsed from JSON
 :)
declare function this:fix-trie(
  $raw as map(xs:string, item())
) as map(xs:string,item())
{
  $raw=>
    map:put("root", 1)=>
    map:put("num-states", xs:integer($raw("num-states")))=>
    map:put("num-keywords", xs:integer($raw("num-keywords")))=>
    map:put("states", this:fix-states($raw("states")))
};

(:~
 : compile()
 : Serialize the trie in a format suitable for parse() to unpack.
 : 
 : @param $trie: the completed trie
 :)
declare function this:compile($trie as map(xs:string,item())) as xs:string
{
  serialize($trie, map {"method": "json"})
};

(:======================================================================:
 : Internal
 :======================================================================:)

(:~
 : get-emits()
 : Trie function to return the emits for the given state, if any.
 : The set of emits returned is the combination of emits on the state itself
 : plus (recursively) the emits on the state for the output link (if any).
 : 
 : @param $trie: the trie
 : @param $state: the state we're getting emits from
 : @param $pos: current position in the input string
 :)
declare %private function this:get-emits(
  $trie as map(xs:string,item()),
  $state as map(xs:string,item()),
  $pos as xs:integer
) as map(xs:string,item())*
{
  if (
    $state("output")=0 and
    empty($state("emits")=>map:keys())
  ) then () else (
    for $emit in $state("emits")
    for $keyword in $emit=>map:keys()
    let $size := xs:integer($trie("size-map")($keyword))
    let $indexes := array:flatten($emit($keyword))
    let $intervals :=
      for $index in $indexes return (
        this:emit($pos - $size + 1, $pos, $keyword, $index)
      )
    return $intervals,
    if ($state("output")=0) then () else (
      this:get-emits($trie, $trie=>this:get-state($state("output")), $pos)
    )
  )
};

(:======================================================================
 : State array data type
 :======================================================================:)

(:~
 : state-array()
 : Construct the state array
 :
 : @param $zero: the value of an uninitialized state 
 :)
declare %private function this:state-array(
  $zero as map(xs:string,item())
) as map(xs:integer,item()*)
{
  map { 0: $zero }
};

(:~
 : array-get()
 : Accessor for a state in the state array. 
 :
 : @param $array: the state array
 : @param $i: index of 
 :)
declare %private function this:array-get(
  $array as map(xs:integer,item()*),
  $i as xs:integer
) as map(xs:string,item())
{
  util:assert($i > 0, "Index must be positive"),
  (map:get($array, $i),map:get($array,0))[1]
};

(:~
 : array-put()
 : Setter for state in the state array. Update the state and return the new
 : state array.
 :
 : @param $array: the state array
 : @param $i: the index of the state to update in the state array
 : @param $value: new value for the state
 :)
declare %private function this:array-put(
  $array as map(xs:integer,item()*),
  $i as xs:integer,
  $value as map(xs:string,item())
) as map(xs:integer,item()*)
{
  $array=>map:put($i, $value)
};

(:======================================================================
 : State data type
 :======================================================================:)

(:~
 : state()
 : Constructor for a state.
 :
 : @param $depth: depth of state in the state array (0=root)
 : @param $id: index of this state in the state array
 :)
declare %private function this:state(
  $depth as xs:integer,
  $id as xs:integer
) as map(xs:string,item())
{
  map {
    "id": $id,         (: index in state array :)
    "depth": $depth,   (: depth in trie; 0=root :)
    "success": map {}, (: codepoint string to state ix :)
    "failure": 1,      (: failure state :)
    "output": 0,       (: output link; 0=none :)
    "emits": map {}    (: keyword to indexes :)
  }
};

(:~
 : empty-state()
 : An out-of-bands empty state
 :)
declare %private function this:empty-state() as map(xs:string,item())
{
  map {}
};

(:~
 : empty-state()
 : Is the state the empty state?
 : 
 : @param $state: state to test
 :)
declare %private function this:empty-state($state as map(xs:string,item())) as xs:boolean
{
  empty($state=>map:keys())
};

(:======================================================================
 : State update/fetch functions
 :======================================================================:)

(:~
 : get-state()
 : Get the state from the trie.
 :
 : @param $trie: the trie
 : @param $i: id of the state (index in the state array)
 :)
declare %private function this:get-state(
  $trie as map(xs:string,item()),
  $i as xs:integer
) as map(xs:string,item())
{
  $trie("states")=>this:array-get($i)
};

(:~
 : put-state()
 : Set the state to a new value.
 : 
 : @param $trie: the trie
 : @param $i: id of the state (index in state array)
 : @param $state: new value for the state
 :)
declare %private function this:put-state(
  $trie as map(xs:string,item()),
  $i as xs:integer,
  $state as map(xs:string,item())
) as map(xs:string,item())
{
  util:assert($i=$state=>map:get("id"), "Mismatched state ID"),
  $trie=>map:put("states", $trie("states")=>this:array-put($i, $state))
};

(:~
 : next-index()
 : New state ID. We mint state IDs as consecutive integers and rely on the
 : state count to be updated properly.
 : 
 : @param $trie: the trie
 :)
declare %private function this:next-index(
  $trie as map(xs:string,item())
) as xs:integer
{
  $trie("num-states")+1
};


(:~
 : next-state()
 : Return successor state for the given state along the given edge.
 : If there is no successor state for the given edge but this is the root state,
 : return the root state itself. Otherwise return the empty state.
 : 
 : @param $state: current state
 : @param $trie: trie holding state array
 : @param $codepoint: which edge to follow (edges are labeled with codepoints) 
 :)
declare %private function this:next-state(
  $state as map(xs:string,item()),
  $trie as map(xs:string,item()),
  $codepoint as xs:integer
) as map(xs:string,item())
{
  $state=>this:next-state($trie, $codepoint, false())
};

(:~
 : next-state-ignore-root-state()
 : Return successor state for the given state along the given edge.
 : If there is no successor state for the given edge return the empty state.
 : 
 : @param $state: current state
 : @param $trie: trie holding state array
 : @param $codepoint: which edge to follow (edges are labeled with codepoints) 
 :)
declare %private function this:next-state-ignore-root-state(
  $state as map(xs:string,item()),
  $trie as map(xs:string,item()),
  $codepoint as xs:integer
) as map(xs:string,item())
{
  $state=>this:next-state($trie, $codepoint, true())
};

(:~
 : next-state()
 : Return successor state for the given state along the given edge.
 : If there is no successor state for the given edge follow the rules
 : for returning either the root or the empty state per the flag. (See
 : next-state() and next-state-ignore-root-state().
 : 
 : @param $state: current state
 : @param $trie: trie holding state array
 : @param $codepoint: which edge to follow (edges are labeled with codepoints) 
 : @param $ignore-root-state: whether to special case the root state if there
 :    if no edge with the given label
 :)
declare %private function this:next-state(
  $state as map(xs:string,item()),
  $trie as map(xs:string,item()),
  $codepoint as xs:integer,
  $ignore-root-state as xs:boolean
) as map(xs:string,item())
{
  let $found := $state("success")($codepoint)
  return (
    if (exists($found))
    then $trie=>this:get-state($found)
    else if (not($ignore-root-state) and
      exists($state("depth")) and $state("depth")=0
    )
    then $trie=>this:get-state($trie("root"))
    else this:empty-state()
  )
};

(:~ 
 : add-state()
 : Add a successor state to the trie, if necessary. If a successor to the current
 : state exists along the given edge, do nothing; if it doesn't, create the new
 : success state and update the trie. Returns both the updated trie and the new
 : state, in that order. (A bit of a hack to get around non-side-effecting maps.)
 :
 : @param $state: current state
 : @param $trie: trie holding the state array
 : @param $codepoint: edge to follow to get to the successor.
 :)
declare %private function this:add-state(
  $state as map(xs:string,item()),
  $trie as map(xs:string,item()),
  $codepoint as xs:integer
) as map(xs:string,item())*
{
  let $next := $state=>this:next-state-ignore-root-state($trie, $codepoint)
  return (
    if (this:empty-state($next)) then (
      let $new-ix := $trie=>this:next-index()
      let $new := this:state($state("depth") + 1, $new-ix)
      let $updated-trie := 
        this:put-state($trie, $state("id"),
          $state=>map:put("success",
            $state("success")=>map:put($codepoint, $new-ix)
          )
        )
      let $updated-trie :=
        $updated-trie=>
          this:put-state($new-ix, $new)=>
          map:put("num-states", $new-ix)
      return (
        $updated-trie,
        $new
      )
    ) else (
      $trie,
      $next
    )
  )
};

(:~
 : add-emit()
 : Add the given keyword/index pair to the set to output for the given state.
 : Return the updated state.
 :
 : @param $state: the state to update
 : @param $keyword: keyword to add to output set for this state
 : @param $index: index of keyword to add to output set for this state
 :)
declare %private function this:add-emit(
  $state as map(xs:string,item()),
  $keyword as xs:string,
  $index as xs:integer
) as map(xs:string,item())
{
  let $state-emits := 
    let $state-emits := $state("emits")
    return
      $state-emits=>map:put($keyword,
        if (empty($state-emits($keyword))) then array {$index}
        else array:append($state-emits($keyword),$index)
      )
  return
    $state=>map:put("emits", $state-emits)
};

(:~
 : get-states()
 : Get the set of successor states for the given state.
 :
 : @param $state: current state
 : @param $trie: trie holding the state array
 :)
declare %private function this:get-states(
  $state as map(xs:string,item()),
  $trie as map(xs:string,item())
) as map(xs:string,item())*
{
  let $success := $state("success")
  for $key in $success=>map:keys()
  return $trie=>this:get-state($success($key))
};

(:~
 : get-transitions()
 : Get the set of edge labels from the given state.
 : 
 : @param $state: current state
 :)
declare %private function this:get-transitions($state) as xs:integer*
{
  ($state("success")=>map:keys())
};

(:======================================================================
 : Failure state calculation
 :======================================================================:)

(:~
 : construct-failure-states()
 : Calculate failure states and output links for the trie. Return the new trie.
 :
 : @param $trie: trie to update
 :)
declare %private function this:construct-failure-states(
  $trie as map(xs:string,item())
) as map(xs:string,item())
{
  let $trie :=
    fold-left(
      $trie=>this:get-state($trie("root"))=>this:get-states($trie),
      $trie,
      function ($trie as map(xs:string,item()), $cur-state as map(xs:string,item())) as map(xs:string,item()) {
        let $ix := $cur-state("id")
        return (
          $trie=>this:put-state($ix, 
            $cur-state=>map:put("failure", $trie("root"))
          )
        )
      }
    )
  let $trie :=
    fold-left(
      $trie=>this:get-state($trie("root"))=>this:get-states($trie),
      $trie,
      function ($trie as map(xs:string,item()), $cur-state as map(xs:string,item())) as map(xs:string,item()) {
        $trie=>this:construct-failure-states($cur-state)
      }
    )
  return $trie=>map:put("constructed-failure-states", true())
};

(:~
 : construct-failure-states()
 : Construct failure states for the all the transitions from the given state in the
 : trie; return the updated trie.
 :
 : @param $trie: the trie
 : @param $state: current state
 :)
declare %private function this:construct-failure-states(
  $trie as map(xs:string,item()),
  $state as map(xs:string,item())
) as map(xs:string,item())
{
  let $transitions := $state=>this:get-transitions()
  let $trie :=
    fold-left(
      $transitions,
      $trie,
      function ($trie as map(xs:string,item()), $transition as xs:integer) as map(xs:string, item()*) {
        let $target-state := $state=>this:next-state($trie, $transition)
        let $trace-failure-state := $state=>this:get-failure-state($trie, $transition)
        let $new-failure-state := $trace-failure-state=>this:next-state($trie, $transition)
        let $output-link :=
          if ($new-failure-state("output")!=0 and
              empty($new-failure-state("emits")=>map:keys()))
          then $new-failure-state("output")
          else $new-failure-state("id")
        let $ix := $target-state("id")
        let $target-state :=
          $target-state=>
            map:put("failure", $new-failure-state("id"))=>
            map:put("output", $output-link)
        return (
          $trie=>this:put-state($ix, $target-state)
        )
      }
    )
  return
    fold-left(
      $transitions,
      $trie,
      function ($trie as map(xs:string,item()), $transition as xs:integer) as map(xs:string, item()*) {
        let $target-state := $state=>this:next-state($trie, $transition)
        return (
          $trie=>this:construct-failure-states($target-state)
        )
      }
    )
};

(:~
 : get-state()
 : Compute the successor state for the given codepoint. This is either a success
 : edge with the given label or the successor to the failure state with the given
 : label.
 :
 : @param $state: current state
 : @param $trie: trie holding the state array
 : @param $codepoint: label for edge to follow
 :)
declare %private function this:get-state(
  $state as map(xs:string,item()),
  $trie as map(xs:string,item()),
  $codepoint as xs:integer
) as map(xs:string,item())
{
  let $result := $state=>this:next-state($trie, $codepoint)
  return
    if (this:empty-state($result))
    then (
      let $failure := $state("failure")
      let $failure-state := $trie=>this:get-state($failure)
      return
        $failure-state=>this:get-state($trie, $codepoint)
    )
    else $result
};

(:~
 : get-failure-state()
 : Calculate the failure state for the current state given an edge label.
 : This is either the successor state with that edge from the failure state
 : of the target state, or recursively it's failure's state edge.
 :
 : @param $state: current state
 : @param $trie: trie holding the state array
 : @param $codepoint: edge label
 :)
declare %private function this:get-failure-state(
  $state as map(xs:string,item()),
  $trie as map(xs:string,item()),
  $codepoint as xs:integer
) as map(xs:string,item())?
{
  let $trace-failure-state := $trie=>this:get-state($state("failure"))
  let $next-state := $trace-failure-state=>this:next-state($trie,$codepoint)
  return
    if (this:empty-state($next-state))
    then this:get-failure-state($trie=>this:get-state($trace-failure-state("failure")), $trie, $codepoint)
    else $trace-failure-state
};

(:======================================================================
 : Interval data type 
 :======================================================================:)

(:~
 : interval()
 : Construct an interval. An emit is a subtype of an interval, but intervals
 : are used on their own in overlap pruning.
 :
 : @param $start: starting position of the interval
 : @param $end: ending position of the interval 
 :)
declare %private function this:interval(
  $start as xs:integer,
  $end as xs:integer
) as map(xs:string,item())
{
  util:assert($start <= $end, "Bad offsets for interval"),
  map {
    "start": $start,
    "end": $end
  }
};

(:~
 : overlaps-with()
 : Does one interval overlap another?
 :
 : @param $interval: one interval to compare
 : @param $other: the other interval to compare
 :)
declare %private function this:overlaps-with(
  $interval as map(xs:string,item()),
  $other as map(xs:string,item())
) as xs:boolean
{
  (this:start($interval) le this:end($other)) and
  (this:end($interval) ge this:start($other))
};

(:======================================================================
 : Emit subtype
 :======================================================================:)

(:~
 : emit()
 : Constructor for an emit (representation of a match).
 : 
 : @param $start: starting position of match in string
 : @param $end: ending position of match in string
 : @param $keyword: matching keyword
 : @param $index: index of matching keyword
 :)
declare %private function this:emit(
  $start as xs:integer,
  $end as xs:integer,
  $keyword as xs:string,
  $index as xs:integer
) as map(xs:string,item())
{
  this:interval($start,$end)=>
    map:put("keyword", $keyword)=>
    map:put("index", $index)
};

(:~
 : equal()
 : Equality of emits
 :
 : @param $this: one emit to compare
 : @param $that: the other
 :)
declare %private function this:equal(
  $this as map(xs:string,item()),
  $that as map(xs:string,item())
) as xs:boolean
{
  (: We don't need to check if index is equal so is keyword :)
  $this("index")=$that("index") and
  $this("start")=$that("start") and
  $this("end")=$that("end")
};

(:======================================================================
 : Partial match processing
 :======================================================================:)

(:~
 : remove-partial-matches()
 : Remove any matches internal to a word, where a word boundary is defined
 : by the given regular expression. Return matches that are not pruned.
 :
 : @param $search-text: text over which we constructed the matches
 : @param $emits: set of matches to prune
 : @param $regex: word boundary regular expression
 :)
declare %private function this:remove-partial-matches(
  $search-text as xs:string,
  $emits as map(xs:string,item())*,
  $regex as xs:string
) as map(xs:string,item())*
{
  for $interval in $emits
  let $before := substring($search-text, $interval("start") - 1, 1)
  let $after := substring($search-text, $interval("end") + 1, 1)
  return (
    if (
      ($before="" or matches($before, $regex)) and
      ($after="" or matches($after, $regex))
    ) then (
      $interval
    ) else (
      ()
    )
  )
};

(:======================================================================
 : Overlap processing
 :======================================================================:)

(:~
 : node()
 : Constructor for a node. For overlap removal we construct a tree of 
 : intervals consisting of these nodes.
 :
 : @param $allow-overlaps: whether we allow overlaps: a workaround
 : @param $intervals: intervals subsumed by this node
 :)
declare %private function this:node(
  $intervals as map(xs:string,item())*
) as map(xs:string,item()*)?
{
  let $point := this:determine-median($intervals)
  let $to_left := for $i in $intervals where this:end($i) lt $point return $i
  let $left := if (empty($to_left)) then () else this:node($to_left)
  let $to_right := for $i in $intervals where this:start($i) gt $point return $i
  let $right := if (empty($to_right)) then () else this:node($to_right)
  let $neither :=
    for $i in $intervals
    where this:start($i) le $point and this:end($i) ge $point
    return $i
  return
    map {
      "location": $point,
      "left": $left,
      "right": $right,
      "intervals": $neither
    }
};

(:~
 : determine-median()
 : Calculate center position of median interval.
 :
 : @param $intervals: set of intervals to process
 :)
declare %private function this:determine-median(
  $intervals as map(xs:string,item())*
) as xs:integer
{
  let $median-interval := 
    fold-left(
      $intervals,
      this:interval(-1,-1),
      function ($current as map(xs:string,item()), $interval as map(xs:string,item())) as map(xs:string,item()) {
        this:interval(
          if (this:start($current) eq -1 or this:start($interval) lt this:start($current))
          then this:start($interval)
          else this:start($current)
          ,
          if (this:end($current) eq -1 or this:end($interval) gt this:end($current))
          then this:end($interval)
          else this:end($current)
        )
      }
    )
  return (this:start($median-interval) + this:end($median-interval)) idiv 2
};

(:~
 : find-overlaps()
 : Find the set of overlapping intervals for the give interval
 : 
 : @param $node: node holding intervals
 : @param $emit: interval to check against $node
 :)
declare %private function this:find-overlaps(
  $node as map(xs:string,item()*)?,
  $emit as map(xs:string,item())
) as map(xs:string,item())*
{
  if (empty($node)) then () else
  if ($node("location") lt this:start($emit)) then (
    (
      this:find-overlaps($node("right"), $emit),
      this:check-overlaps($node, $emit, "right")
    )[not(this:equal($emit, .))]
  ) else if ($node("location") gt this:end($emit)) then (
    (
      this:find-overlaps($node("left"), $emit),
      this:check-overlaps($node, $emit, "left")
    )[not(this:equal($emit, .))]
  ) else (
    (
      $node("intervals"),
      this:find-overlaps($node("left"), $emit),
      this:find-overlaps($node("right"), $emit)
    )[not(this:equal($emit, .))]
  )
};

(:~
 : check-overlaps()
 : Base case for find-overlaps for recursing the right way in the node tree.
 :
 : @param $node: interval node
 : @param $emit: match to check
 : @param $direction: left or right, which subtree to check
 :)
declare %private function this:check-overlaps(
  $node as map(xs:string,item()*),
  $emit as map(xs:string,item()),
  $direction as xs:string
) as map(xs:string,item())*
{
  switch ($direction)
  case "left" return (
    ($node("intervals"))[this:start(.) le this:end($emit)]
  )
  case "right" return (
    ($node("intervals"))[this:end(.) ge this:start($emit)]
  )
  default return (
    errors:error("ML-BADARGS", ("direction",$direction))
  )
};

(:~
 : remove-overlaps()
 : Find set of overlaps from a set of matches, and return the set with the
 : overlaps pruned out. We prefer to keep longer matches and for matches with
 : the same length, those with a lower starting point.
 : 
 : @param $node: interval node tree containing the matches
 : @param $overlaps: set of overlapping intervals
 :)
declare %private function this:remove-overlaps(
  $node as map(xs:string,item()*)?,
  $overlaps as map(xs:string,item())*
) as map(xs:string,item())*
{
  if (empty($node)) then () else
  (: Prefer longer matches, prefer matches with lower starts :)
  (: Return in starting position order :)
  let $sorted :=
    for $emit in $overlaps
    order by this:interval-size($emit) descending, this:start($emit) ascending
    return $emit
  let $to-remove :=
    fold-left(
      $sorted,
      (),
      function ($to-remove as map(xs:string,item())*, $emit as map(xs:string,item())) as map(xs:string,item())* {
        $to-remove,
        if (every $other in $to-remove satisfies not(this:equal($other, $emit)))
        then (
          this:find-overlaps($node, $emit)
        ) else ()
      }
    )
  for $emit in $sorted
  where every $other in $to-remove satisfies not(this:equal($other,$emit))
  return $emit
};


(:======================================================================:
 : Trie load fix-up
 :======================================================================:)
 
declare %private function this:fix-states($raw as map(*)) as map(xs:integer, item()*)
{
  fold-left(
    $raw=>map:keys(),
    map {},
    function ($fixed as map(xs:integer, item()*), $key) as map(xs:integer, item()*) {
      if (xs:integer($key)=0) then $fixed=>map:put(xs:integer($key), $raw($key))
      else $fixed=>map:put(xs:integer($key), this:fix-state($raw($key)))
    }
  )
};

declare %private function this:fix-state($raw as map(xs:string, item())) as map(xs:string, item())
{
  if (empty($raw=>map:keys())) then $raw
  else
    let $fixed := 
      $raw=>
        map:put("id", xs:integer($raw("id")))=>
        map:put("depth", xs:integer($raw("depth")))=>
        map:put("success", this:fix-success($raw("success")))=>
        map:put("failure", xs:integer($raw("failure")))=>
        map:put("emits", this:fix-emits($raw("emits")))
    return
      if (empty($raw("output")))
      then $fixed
      else $fixed=>map:put("output", xs:integer($raw("output")))
};

declare %private function this:fix-success($raw as map(*)?) as map(xs:integer, xs:integer)?
{
  if (empty($raw)) then $raw else
  fold-left(
    $raw=>map:keys(),
    map {},
    function ($fixed as map(xs:integer, xs:integer), $key) as map(xs:integer, xs:integer) {
      $fixed=>map:put(xs:integer($key), xs:integer($raw($key)))
    }
  )
};

declare %private function this:fix-emits($raw as map(*)?) as map(xs:string, array(xs:integer))
{
  if (empty($raw)) then map {}
  else
  fold-left(
    $raw=>map:keys(),
    $raw,
    function ($fixed as map(*), $key as xs:string) as map(*)
    {
      $fixed=>map:put($key, array { array:flatten($raw($key))!xs:integer(.) })
    }
  )
};