http://mathling.com/string/entity library module
http://mathling.com/string/entity
Simple entity matching.
Each item in the entity table consists of an identifier, a normalized string,
a match string, and a 'type'. Matching is based on the match string while
the index returned from matching can be used to look up the other information
from the table. There can be multiple items with the same ID, representing
different alternatives for the same unique concept. There can be multiple
items with the same match string, representing different concepts for the
same lemma. The type and ID can also be linked to semantic information
(e.g. RDF) to drive more advanced processing.
The main markup() and extract() functions take a function to decide what
to make of the matching entity. Some sample functions are included.
Copyright© Mary Holstege 2020-2023
CC-BY (https://creativecommons.org/licenses/by/4.0/)
Imports
http://mathling.com/core/utilities
import module namespace util="http://mathling.com/core/utilities" at "../core/utilities.xqy"
http://mathling.com/string/aho
import module namespace aho="http://mathling.com/string/aho" at "../string/aho.xqy"
http://mathling.com/core/errors
import module namespace errors="http://mathling.com/core/errors" at "../core/errors.xqy"
Variables
Variable: $DEFAULT-CONFIG as map(xs:string,item()*)
Default config: case-sensitive
Variable: $DEFAULT-OPTIONS as map(xs:string,item()*)
Default options: no overlaps, no partial words
Functions
Function: entity
declare function entity($id as xs:string,
$normalized as xs:string,
$word as xs:string,
$type as xs:string) as map(xs:string,xs:string)
declare function entity($id as xs:string, $normalized as xs:string, $word as xs:string, $type as xs:string) as map(xs:string,xs:string)
entity()
Constructor for entity item.
Params
- id as xs:string: unique identifier
- normalized as xs:string: normalized form of entity
- word as xs:string: match string, a lemma
- type as xs:string: type string
Returns
- map(xs:string,xs:string)
(:~
 : entity()
 : Build one entity-table item as a string-to-string map.
 :
 : @param $id: unique identifier
 : @param $normalized: normalized form of the entity
 : @param $word: match string, a lemma
 : @param $type: type string
 : @return map with the keys "id", "normalized", "word", and "type"
 :)
declare function this:entity(
  $id as xs:string,
  $normalized as xs:string,
  $word as xs:string,
  $type as xs:string
) as map(xs:string,xs:string)
{
  (: The four keys are distinct, so merging the entries cannot collide. :)
  map:merge((
    map:entry("id", $id),
    map:entry("normalized", $normalized),
    map:entry("word", $word),
    map:entry("type", $type)
  ))
}
Function: id
declare function id($entity as map(xs:string, xs:string)) as xs:string
declare function id($entity as map(xs:string, xs:string)) as xs:string
id()
Accessor for unique ID.
Params
- entity as map(xs:string,xs:string): the entity
Returns
- xs:string
(:~
 : id()
 : Read the unique identifier stored under the "id" key of an entity item.
 :
 : @param $entity: the entity
 : @return the value of the "id" key
 :)
declare function this:id(
  $entity as map(xs:string, xs:string)
) as xs:string
{
  map:get($entity, "id")
}
Function: normalized
declare function normalized($entity as map(xs:string, xs:string)) as xs:string
declare function normalized($entity as map(xs:string, xs:string)) as xs:string
normalized()
Accessor for normalized form of entity.
Params
- entity as map(xs:string,xs:string): the entity
Returns
- xs:string
(:~
 : normalized()
 : Read the normalized form stored under the "normalized" key of an
 : entity item.
 :
 : @param $entity: the entity
 : @return the value of the "normalized" key
 :)
declare function this:normalized(
  $entity as map(xs:string, xs:string)
) as xs:string
{
  map:get($entity, "normalized")
}
Function: word
declare function word($entity as map(xs:string, xs:string)) as xs:string
declare function word($entity as map(xs:string, xs:string)) as xs:string
word()
Accessor for match string.
Params
- entity as map(xs:string,xs:string): the entity
Returns
- xs:string
(:~
 : word()
 : Read the match string stored under the "word" key of an entity item.
 :
 : @param $entity: the entity
 : @return the value of the "word" key
 :)
declare function this:word(
  $entity as map(xs:string, xs:string)
) as xs:string
{
  map:get($entity, "word")
}
Function: type
declare function type($entity as map(xs:string, xs:string)) as xs:string
declare function type($entity as map(xs:string, xs:string)) as xs:string
type()
Accessor for the type string.
Params
- entity as map(xs:string,xs:string): the entity
Returns
- xs:string
(:~
 : type()
 : Read the type string stored under the "type" key of an entity item.
 :
 : @param $entity: the entity
 : @return the value of the "type" key
 :)
declare function this:type(
  $entity as map(xs:string, xs:string)
) as xs:string
{
  map:get($entity, "type")
}
Function: dictionary
declare function dictionary($entities as map(xs:string,xs:string)*) as map(xs:string,item()*)
declare function dictionary($entities as map(xs:string,xs:string)*) as map(xs:string,item()*)
dictionary()
Constructor for entity dictionary
Params
- entities as map(xs:string,xs:string)*: entity items in the table
Returns
- map(xs:string,item()*)
(:~
 : dictionary()
 : Build an entity dictionary using the module's default trie configuration.
 : The result maps "trie" to the trie built by aho:trie() over the match
 : strings and "entities" to an array holding the entity items in input order.
 :
 : @param $entities: entity items in the table
 : @return map with the keys "trie" and "entities"
 :)
declare function this:dictionary(
  $entities as map(xs:string,xs:string)*
) as map(xs:string,item()*)
{
  (: One match string per entity, in the same order as the entity array. :)
  let $words := $entities ! this:word(.)
  let $trie := aho:trie($this:DEFAULT-CONFIG, $words)
  return map {
    "trie": $trie,
    "entities": array { $entities }
  }
}
Function: dictionary
declare function dictionary($entities as map(xs:string,xs:string)*,
$config as map(xs:string,item()*)) as map(xs:string,item()*)
declare function dictionary($entities as map(xs:string,xs:string)*, $config as map(xs:string,item()*)) as map(xs:string,item()*)
dictionary()
Constructor for entity dictionary
Params
- entities as map(xs:string,xs:string)*: entity items in the table
- config as map(xs:string,item()*): configuration options
Returns
- map(xs:string,item()*)
(:~
 : dictionary()
 : Build an entity dictionary with caller-supplied trie configuration.
 : The supplied configuration is combined with $this:DEFAULT-CONFIG through
 : util:merge-into() before the trie is built. The result maps "trie" to the
 : trie and "entities" to an array holding the entity items in input order.
 :
 : @param $entities: entity items in the table
 : @param $config: configuration options
 : @return map with the keys "trie" and "entities"
 :)
declare function this:dictionary(
  $entities as map(xs:string,xs:string)*,
  $config as map(xs:string,item()*)
) as map(xs:string,item()*)
{
  let $effective-config := util:merge-into($this:DEFAULT-CONFIG, $config)
  (: One match string per entity, in the same order as the entity array. :)
  let $words := $entities ! this:word(.)
  let $trie := aho:trie($effective-config, $words)
  return map {
    "trie": $trie,
    "entities": array { $entities }
  }
}
Function: extract
declare function extract($dictionary as map(xs:string,item()*),
$input as node()*,
$extract-function as
function((:text:)xs:string,
(:match:)map(xs:string,item()),
(:entity:)map(xs:string,xs:string)) as node()*) as node()*
declare function extract($dictionary as map(xs:string,item()*), $input as node()*, $extract-function as function((:text:)xs:string, (:match:)map(xs:string,item()), (:entity:)map(xs:string,xs:string)) as node()*) as node()*
extract()
Extract all the entities from the text nodes in the input using the
default match options, and return them as handled by the extract function.
Params
- dictionary as map(xs:string,item()*): the entity dictionary
- input as node()*: nodes to process
- extract-function as function(xs:string, map(xs:string,item()), map(xs:string,xs:string)) as node()*: a function to process each match. The function is given the matching text from the input, the match itself, and the corresponding entity from the entity table in the dictionary.
Returns
- node()*
(:~
 : extract()
 : Convenience form of the four-argument extract(): delegates to it with an
 : empty map of match options, i.e. with no option overrides.
 :
 : @param $dictionary: the entity dictionary
 : @param $input: nodes to process
 : @param $extract-function: a function to process each match
 :   It is given the matching text from the input, the match itself,
 :   and the corresponding entity from the dictionary's entity table
 : @return whatever the four-argument extract() returns
 :)
declare function this:extract(
  $dictionary as map(xs:string,item()*),
  $input as node()*,
  $extract-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item()),
             (:entity:)map(xs:string,xs:string)) as node()*
) as node()*
{
  let $no-overrides := map {}
  return this:extract($dictionary, $input, $extract-function, $no-overrides)
}
Function: extract
declare function extract($dictionary as map(xs:string,item()*),
$input as node()*,
$extract-function as
function((:text:)xs:string,
(:match:)map(xs:string,item()),
(:entity:)map(xs:string,xs:string)) as node()*,
$match-options as map(xs:string,item()*)?) as node()*
declare function extract($dictionary as map(xs:string,item()*), $input as node()*, $extract-function as function((:text:)xs:string, (:match:)map(xs:string,item()), (:entity:)map(xs:string,xs:string)) as node()*, $match-options as map(xs:string,item()*)?) as node()*
extract()
Extract all the entities from the text nodes in the input using the
given match options, and return them as handled by the extract function.
Params
- dictionary as map(xs:string,item()*): the entity dictionary
- input as node()*: nodes to process
- extract-function as function(xs:string, map(xs:string,item()), map(xs:string,xs:string)) as node()*: a function to process each match. The function is given the matching text from the input, the match itself, and the corresponding entity from the entity table in the dictionary.
- match-options as map(xs:string,item()*)?: options to use for matching keywords
Returns
- node()*
(:~
 : extract()
 : Run the dictionary's trie over the text-node children of each input node
 : and return the extract function's output for every match, in match order.
 : The given match options are combined with $this:DEFAULT-OPTIONS through
 : util:merge-into().
 :
 : Only `$input/text()` is scanned: text nodes nested more deeply than the
 : direct children of an input node are not visited.
 :
 : @param $dictionary: the entity dictionary
 : @param $input: nodes to process
 : @param $extract-function: a function to process each match
 :   It is given the matching text from the input, the match itself,
 :   and the corresponding entity from the dictionary's entity table
 : @param $match-options: options to use for matching keywords
 : @return concatenated results of the extract function
 :)
declare function this:extract(
  $dictionary as map(xs:string,item()*),
  $input as node()*,
  $extract-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item()),
             (:entity:)map(xs:string,xs:string)) as node()*,
  $match-options as map(xs:string,item()*)?
) as node()*
{
  let $options := util:merge-into($this:DEFAULT-OPTIONS, $match-options)
  let $trie := $dictionary("trie")
  let $entities := $dictionary("entities")
  for $text in $input/text()
  for $match in aho:get-matches($trie, $text, $options)
  (: Slice the matched span out of the text node's string value. :)
  let $matched := substring($text, aho:start($match), aho:interval-size($match))
  (: The match index selects the corresponding item of the entity array. :)
  let $entity := array:get($entities, aho:index($match))
  return $extract-function($matched, $match, $entity)
}
Function: markup
declare function markup($dictionary as map(xs:string,item()*),
$input as node()*,
$markup-function as
function((:text:)xs:string,
(:match:)map(xs:string,item()),
(:entity:)map(xs:string,xs:string)) as node()*) as node()*
declare function markup($dictionary as map(xs:string,item()*), $input as node()*, $markup-function as function((:text:)xs:string, (:match:)map(xs:string,item()), (:entity:)map(xs:string,xs:string)) as node()*) as node()*
markup()
Walk the input nodes and return them with all entity matches replaced
with the output of the markup function over the given match. Use the
default match options.
Params
- dictionary as map(xs:string,item()*): the entity dictionary
- input as node()*: nodes to process
- markup-function as function(xs:string, map(xs:string,item()), map(xs:string,xs:string)) as node()*: a function to process each match. The function is given the matching text from the input, the match itself, and the corresponding entity from the entity table in the dictionary.
Returns
- node()*
(:~
 : markup()
 : Convenience form of the four-argument markup(): delegates to it with an
 : empty map of match options, i.e. with no option overrides.
 :
 : @param $dictionary: the entity dictionary
 : @param $input: nodes to process
 : @param $markup-function: a function to process each match
 :   It is given the matching text from the input, the match itself,
 :   and the corresponding entity from the dictionary's entity table
 : @return whatever the four-argument markup() returns
 :)
declare function this:markup(
  $dictionary as map(xs:string,item()*),
  $input as node()*,
  $markup-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item()),
             (:entity:)map(xs:string,xs:string)) as node()*
) as node()*
{
  let $no-overrides := map {}
  return this:markup($dictionary, $input, $markup-function, $no-overrides)
}
Function: markup
declare function markup($dictionary as map(xs:string,item()*),
$input as node()*,
$markup-function as
function((:text:)xs:string,
(:match:)map(xs:string,item()),
(:entity:)map(xs:string,xs:string)) as node()*,
$match-options as map(xs:string,item()*)?) as node()*
declare function markup($dictionary as map(xs:string,item()*), $input as node()*, $markup-function as function((:text:)xs:string, (:match:)map(xs:string,item()), (:entity:)map(xs:string,xs:string)) as node()*, $match-options as map(xs:string,item()*)?) as node()*
markup()
Walk the input nodes and return them with all entity matches replaced
with the output of the markup function over the given match. Use the
given match options.
Params
- dictionary as map(xs:string,item()*): the entity dictionary
- input as node()*: nodes to process
- markup-function as function(xs:string, map(xs:string,item()), map(xs:string,xs:string)) as node()*: a function to process each match. The function is given the matching text from the input, the match itself, and the corresponding entity from the entity table in the dictionary.
- match-options as map(xs:string,item()*)?: options to use for matching keywords
Returns
- node()*
(:~
 : markup()
 : Walk the input nodes and return them with all entity matches replaced
 : by the output of the markup function over the given match. The given
 : match options are combined with $this:DEFAULT-OPTIONS through
 : util:merge-into().
 :
 : Document and element nodes are rebuilt with their children processed
 : recursively; elements keep their attributes. Text nodes are split around
 : the matches. All other nodes are returned unchanged.
 :
 : @param $dictionary: the entity dictionary
 : @param $input: nodes to process
 : @param $markup-function: a function to process each match
 :   It is given the matching text from the input, the match itself,
 :   and the corresponding entity from the dictionary's entity table
 : @param $match-options: options to use for matching keywords
 : @return the rebuilt nodes
 :)
declare function this:markup(
  $dictionary as map(xs:string,item()*),
  $input as node()*,
  $markup-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item()),
             (:entity:)map(xs:string,xs:string)) as node()*,
  $match-options as map(xs:string,item()*)?
) as node()*
{
  let $options := util:merge-into($this:DEFAULT-OPTIONS, $match-options)
  for $node in $input
  return
    typeswitch($node)
    case document-node() return
      document {
        this:markup($dictionary, $node/node(), $markup-function, $options)
      }
    case element() return
      element {node-name($node)} {
        (: node() does not select attributes, so copy them explicitly;
         : they must precede all other content of the constructed element.
         :)
        $node/@*,
        this:markup($dictionary, $node/node(), $markup-function, $options)
      }
    case text() return
      let $text := string($node)
      let $matches := $dictionary("trie")=>aho:get-matches($text, $options)
      (: Fold accumulator: the head is the 1-based position of the first
       : character not yet emitted, the tail is the nodes produced so far.
       :)
      let $markup :=
        fold-left(
          $matches,
          (1),
          function ($markup as item()*, $match as map(xs:string,item())*) as item()* {
            let $pos := head($markup)
            let $next := aho:start($match)
            return (
              aho:end($match)+1,
              tail($markup),
              (
                (: Unmatched text between the previous match and this one. :)
                if ($next > $pos) then (
                  text { substring($text, $pos, $next - $pos) }
                ) else (),
                $markup-function(
                  substring($text, aho:start($match), aho:interval-size($match)),
                  $match,
                  $dictionary("entities")=>array:get(aho:index($match))
                )
              )
            )
          }
        )
      return (
        tail($markup),
        (: Whatever follows the last match. :)
        text { substring($text, head($markup)) }
      )
    default return $node
}
Function: normalize
declare function normalize($matching as xs:string,
$match as map(xs:string,item()),
$entity as map(xs:string,xs:string)) as node()*
declare function normalize($matching as xs:string, $match as map(xs:string,item()), $entity as map(xs:string,xs:string)) as node()*
normalize()
Replace the matching text with the normalized form of the entity.
Params
- matching as xs:string
- match as map(xs:string,item())
- entity as map(xs:string,xs:string)
Returns
- node()*
(:~
 : normalize()
 : Markup function that replaces the matching text with a text node holding
 : the normalized form of the entity.
 :
 : @param $matching: the matching text (unused)
 : @param $match: the match (unused)
 : @param $entity: the entity whose normalized form is emitted
 : @return a text node with the normalized form
 :)
declare %art:markup function this:normalize(
  $matching as xs:string,
  $match as map(xs:string,item()),
  $entity as map(xs:string,xs:string)
) as node()*
{
  let $replacement := this:normalized($entity)
  return text { $replacement }
}
Function: remove
declare function remove($matching as xs:string,
$match as map(xs:string,item()),
$entity as map(xs:string,xs:string)) as node()*
declare function remove($matching as xs:string, $match as map(xs:string,item()), $entity as map(xs:string,xs:string)) as node()*
remove()
Remove the matching text.
Params
- matching as xs:string
- match as map(xs:string,item())
- entity as map(xs:string,xs:string)
Returns
- node()*
(:~
 : remove()
 : Markup function that drops the matching text: it always returns the
 : empty sequence, whatever its arguments are.
 :
 : @param $matching: the matching text (unused)
 : @param $match: the match (unused)
 : @param $entity: the entity (unused)
 : @return the empty sequence
 :)
declare %art:markup function this:remove(
  $matching as xs:string,
  $match as map(xs:string,item()),
  $entity as map(xs:string,xs:string)
) as node()*
{
  (: Nothing replaces the match. :)
  ()
}
Function: full-entity
declare function full-entity($matching as xs:string,
$match as map(xs:string,item()),
$entity as map(xs:string,xs:string)) as node()*
declare function full-entity($matching as xs:string, $match as map(xs:string,item()), $entity as map(xs:string,xs:string)) as node()*
full-entity()
Wrap the matching text in entity markup. The entity element
will include attributes with the entity id, the normalized form, the
start position of the match (useful for extract()), and possibly a subtype.
The element name will be derived from the entity type. This function
assumes types are in the form type:subtype, e.g. event:surveillance, or
just plain type names.
Params
- matching as xs:string
- match as map(xs:string,item())
- entity as map(xs:string,xs:string)
Returns
- node()*
(:~
 : full-entity()
 : Markup function that wraps the matching text in an entity element in the
 : http://mathling.com/entity namespace. The element carries the entity id
 : ("id"), the normalized form ("norm"), the start position of the match
 : ("start"), and a "subtype" attribute when the entity type has one.
 :
 : The type is read as type:subtype (e.g. event:surveillance) or as a plain
 : type name. The element name is the part before the first ":", passed
 : through this:safe-name(). A plain type name has no subtype, so no
 : "subtype" attribute is written for it.
 :
 : @param $matching: the matching text, used as the element content
 : @param $match: the match, used for the start position
 : @param $entity: the entity supplying id, normalized form, and type
 : @return the entity element
 :)
declare %art:markup function this:full-entity(
  $matching as xs:string,
  $match as map(xs:string,item()),
  $entity as map(xs:string,xs:string)
) as node()*
{
  let $full-type := this:type($entity)
  let $has-subtype := contains($full-type, ":")
  let $type :=
    if ($has-subtype) then substring-before($full-type, ":") else $full-type
  (: The fallback is the empty sequence, not the type itself; otherwise the
   : exists() test below is always true and plain types get a bogus subtype.
   :)
  let $subtype :=
    if ($has-subtype) then substring-after($full-type, ":") else ()
  return
    element {QName("http://mathling.com/entity", "e:"||this:safe-name($type))} {
      attribute id {this:id($entity)},
      attribute norm {this:normalized($entity)},
      if (exists($subtype)) then attribute subtype {$subtype} else (),
      attribute start {aho:start($match)},
      $matching
    }
}
Function: basic-entity
declare function basic-entity($matching as xs:string,
$match as map(xs:string,item()),
$entity as map(xs:string,xs:string)) as node()*
declare function basic-entity($matching as xs:string, $match as map(xs:string,item()), $entity as map(xs:string,xs:string)) as node()*
basic-entity()
Wrap the matching text in entity markup. The entity element
will include an attribute with the entity id.
The element name will be derived from the entity type. This function
assumes types are in the form type:subtype, e.g. event:surveillance, or
just plain type names.
Params
- matching as xs:string
- match as map(xs:string,item())
- entity as map(xs:string,xs:string)
Returns
- node()*
(:~
 : basic-entity()
 : Markup function that wraps the matching text in an entity element in the
 : http://mathling.com/entity namespace, carrying only the entity id as an
 : "id" attribute.
 :
 : The type is read as type:subtype (e.g. event:surveillance) or as a plain
 : type name. The element name is the part before the first ":", passed
 : through this:safe-name().
 :
 : @param $matching: the matching text, used as the element content
 : @param $match: the match (unused)
 : @param $entity: the entity supplying id and type
 : @return the entity element
 :)
declare %art:markup function this:basic-entity(
  $matching as xs:string,
  $match as map(xs:string,item()),
  $entity as map(xs:string,xs:string)
) as node()*
{
  let $full-type := this:type($entity)
  let $main-type :=
    if (contains($full-type, ":"))
    then substring-before($full-type, ":")
    else $full-type
  let $element-name :=
    QName("http://mathling.com/entity", "e:"||this:safe-name($main-type))
  return
    element {$element-name} {
      attribute id {this:id($entity)},
      $matching
    }
}
Function: dictionary-from-file
declare function dictionary-from-file($file as xs:string) as map(xs:string,item()*)
declare function dictionary-from-file($file as xs:string) as map(xs:string,item()*)
dictionary-from-file:
Construct an entity dictionary from a text file consisting of tab-delimited
lines. Each line is assumed to have the format:
id\tnormalized form\tmatch string\ttype
Returns the entity dictionary built from the parsed entity items.
Params
- file as xs:string: input file
Returns
- map(xs:string,item()*)
(:~
 : dictionary-from-file:
 : Construct an entity dictionary from a text file consisting of tab-delimited
 : lines. Each line is assumed to have the format:
 :   id\tnormalized form\tmatch string\ttype
 : Blank lines are skipped, as they are by dictionary-from-buffer().
 : Returns the dictionary built from the parsed entity items.
 :
 : @param $file: input file
 : @return the entity dictionary
 :)
declare function this:dictionary-from-file(
  $file as xs:string
) as map(xs:string,item()*)
{
  (: unparsed-text-lines() drops only a trailing empty line; a blank line
   : elsewhere has no fields and would fail the entity() type checks.
   :)
  let $lines := unparsed-text-lines($file)[normalize-space(.) ne ""]
  return this:dictionary(this:parse-entities($lines))
}
Function: dictionary-from-file
declare function dictionary-from-file($file as xs:string,
$config as map(xs:string,item()*)) as map(xs:string,item()*)
declare function dictionary-from-file($file as xs:string, $config as map(xs:string,item()*)) as map(xs:string,item()*)
dictionary-from-file:
Construct an entity dictionary from a text file consisting of tab-delimited
lines. Each line is assumed to have the format:
id\tnormalized form\tmatch string\ttype
Returns the entity dictionary built from the parsed entity items.
Params
- file as xs:string: input file
- config as map(xs:string,item()*): configuration options
Returns
- map(xs:string,item()*)
(:~
 : dictionary-from-file:
 : Construct an entity dictionary from a text file consisting of tab-delimited
 : lines, using the given configuration. Each line is assumed to have the
 : format:
 :   id\tnormalized form\tmatch string\ttype
 : Blank lines are skipped, as they are by dictionary-from-buffer().
 : Returns the dictionary built from the parsed entity items.
 :
 : @param $file: input file
 : @param $config: configuration options
 : @return the entity dictionary
 :)
declare function this:dictionary-from-file(
  $file as xs:string,
  $config as map(xs:string,item()*)
) as map(xs:string,item()*)
{
  (: unparsed-text-lines() drops only a trailing empty line; a blank line
   : elsewhere has no fields and would fail the entity() type checks.
   :)
  let $lines := unparsed-text-lines($file)[normalize-space(.) ne ""]
  return this:dictionary(this:parse-entities($lines), $config)
}
Function: dictionary-from-buffer
declare function dictionary-from-buffer($string as xs:string) as map(xs:string,item()*)
declare function dictionary-from-buffer($string as xs:string) as map(xs:string,item()*)
dictionary-from-buffer:
Construct an entity dictionary from a string buffer consisting of
tab-delimited lines. Each line is assumed to have the format:
id\tnormalized form\tmatch string\ttype
Returns the entity dictionary built from the parsed entity items.
Params
- string as xs:string: input string
Returns
- map(xs:string,item()*)
(:~
 : dictionary-from-buffer:
 : Construct an entity dictionary from a string buffer consisting of
 : tab-delimited lines. Each line is assumed to have the format:
 :   id\tnormalized form\tmatch string\ttype
 : The buffer is split on CRLF, CR, or LF, and empty lines are discarded
 : before parsing.
 :
 : @param $string: input string
 : @return the entity dictionary
 :)
declare function this:dictionary-from-buffer(
  $string as xs:string
) as map(xs:string,item()*)
{
  let $lines := tokenize($string, "\r\n|\r|\n")[string-length(.) gt 0]
  let $entities := this:parse-entities($lines)
  return this:dictionary($entities)
}
Function: dictionary-from-buffer
declare function dictionary-from-buffer($string as xs:string,
$config as map(xs:string,item()*)) as map(xs:string,item()*)
declare function dictionary-from-buffer($string as xs:string, $config as map(xs:string,item()*)) as map(xs:string,item()*)
dictionary-from-buffer:
Construct an entity dictionary from a string buffer consisting of
tab-delimited lines. Each line is assumed to have the format:
id\tnormalized form\tmatch string\ttype
Returns the entity dictionary built from the parsed entity items.
Params
- string as xs:string: input string
- config as map(xs:string,item()*): configuration options
Returns
- map(xs:string,item()*)
(:~
 : dictionary-from-buffer:
 : Construct an entity dictionary from a string buffer consisting of
 : tab-delimited lines, using the given configuration. Each line is assumed
 : to have the format:
 :   id\tnormalized form\tmatch string\ttype
 : The buffer is split on CRLF, CR, or LF, and empty lines are discarded
 : before parsing.
 :
 : @param $string: input string
 : @param $config: configuration options
 : @return the entity dictionary
 :)
declare function this:dictionary-from-buffer(
  $string as xs:string,
  $config as map(xs:string,item()*)
) as map(xs:string,item()*)
{
  let $lines := tokenize($string, "\r\n|\r|\n")[string-length(.) gt 0]
  let $entities := this:parse-entities($lines)
  return this:dictionary($entities, $config)
}
Function: save-compiled
declare function save-compiled($to as xs:string, $dictionary as map(xs:string,item()*)) as empty-sequence()
declare function save-compiled($to as xs:string, $dictionary as map(xs:string,item()*)) as empty-sequence()
save-compiled()
Save a compiled dictionary to a file. Requires Saxon-PE or Saxon-EE. If
you have Saxon-HE set query output method to text and use serialize()
directly.
Params
- to as xs:string: file to write to
- dictionary as map(xs:string,item()*): dictionary to save
Returns
- empty-sequence()
(:~
 : save-compiled()
 : Serialize a compiled dictionary with the JSON output method and hand the
 : result to $this:SAVE-IMPL together with the target file name.
 : Requires Saxon-PE or Saxon-EE. If you have Saxon-HE, set the query output
 : method to text and use serialize() directly.
 :
 : @param $to: file to write to
 : @param $dictionary: dictionary to save
 :)
declare function this:save-compiled(
  $to as xs:string,
  $dictionary as map(xs:string,item()*)
) as empty-sequence()
{
  let $json := serialize($dictionary, map {"method": "json"})
  return $this:SAVE-IMPL($to, $json)
}
Function: load-compiled
declare function load-compiled($file as xs:string)
declare function load-compiled($file as xs:string)
load-compiled()
Load a compiled dictionary. (Save the work of computing the states for the
trie.)
Params
- file as xs:string: file containing the saved compiled dictionary
(:~
 : load-compiled()
 : Load a compiled dictionary from a JSON file, saving the work of computing
 : the states for the trie. The "trie" entry of the parsed JSON is replaced
 : by the result of aho:fix-trie() on it.
 :
 : @param $file: file containing the saved compiled dictionary
 : @return the parsed dictionary with its "trie" entry repaired
 :)
declare function this:load-compiled($file as xs:string)
{
  let $raw := parse-json(unparsed-text($file))
  (: Integers come back from the JSON round trip as doubles, so the trie
   : has to be fixed up before use.
   :)
  let $trie := aho:fix-trie($raw("trie"))
  return map:put($raw, "trie", $trie)
}
Original Source Code
xquery version "3.1";
(:~
 : Simple entity matching.
 : Each item in the entity table consists of an identifier, a normalized string,
 : a match string, and a 'type'. Matching is based on the match string while
 : the index returned from matching can be used to look up the other information
 : from the table. There can be multiple items with the same ID, representing
 : different alternatives for the same unique concept. There can be multiple
 : items with the same match string, representing different concepts for the
 : same lemma. The type and ID can also be linked to semantic information
 : (e.g. RDF) to drive more advanced processing.
 :
 : The main markup() and extract() functions take a function to decide what
 : to make of the matching entity. Some sample functions are included.
 :
 : Copyright© Mary Holstege 2020-2023
 : CC-BY (https://creativecommons.org/licenses/by/4.0/)
 : @since February 2021
 :)
module namespace this="http://mathling.com/string/entity";

import module namespace errors="http://mathling.com/core/errors"
  at "../core/errors.xqy";
import module namespace util="http://mathling.com/core/utilities"
  at "../core/utilities.xqy";
import module namespace aho="http://mathling.com/string/aho"
  at "../string/aho.xqy";

declare namespace art="http://mathling.com/art";
declare namespace map="http://www.w3.org/2005/xpath-functions/map";
declare namespace math="http://www.w3.org/2005/xpath-functions/math";
declare namespace array="http://www.w3.org/2005/xpath-functions/array";

(:~ Default config: case-sensitive :)
declare variable $this:DEFAULT-CONFIG as map(xs:string,item()*) :=
  map {
    "case-insensitive": false()
  }
;

(:~ Default options: no overlaps, no partial words :)
declare variable $this:DEFAULT-OPTIONS as map(xs:string,item()*) :=
  map {
    "only-whole-words": true()
  }
;

(:======================================================================:
 : Public API
 :======================================================================:)

(:======================================================================:
 : Entity
 :======================================================================:)

(:~
 : entity()
 : Constructor for entity item.
 :
 : @param $id: unique identifier
 : @param $normalized: normalized form of entity
 : @param $word: match string, a lemma
 : @param $type: type string
 : @return map with the keys "id", "normalized", "word", and "type"
 :)
declare function this:entity(
  $id as xs:string,
  $normalized as xs:string,
  $word as xs:string,
  $type as xs:string
) as map(xs:string,xs:string)
{
  map {
    "id": $id,
    "normalized": $normalized,
    "word": $word,
    "type": $type
  }
};

(:~
 : id()
 : Accessor for unique ID.
 :
 : @param $entity: the entity
 :)
declare function this:id($entity as map(xs:string, xs:string)) as xs:string
{
  $entity("id")
};

(:~
 : normalized()
 : Accessor for normalized form of entity.
 :
 : @param $entity: the entity
 :)
declare function this:normalized($entity as map(xs:string, xs:string)) as xs:string
{
  $entity("normalized")
};

(:~
 : word()
 : Accessor for match string.
 :
 : @param $entity: the entity
 :)
declare function this:word($entity as map(xs:string, xs:string)) as xs:string
{
  $entity("word")
};

(:~
 : type()
 : Accessor for the type string.
 :
 : @param $entity: the entity
 :)
declare function this:type($entity as map(xs:string, xs:string)) as xs:string
{
  $entity("type")
};

(:======================================================================:
 : Dictionary
 :======================================================================:)

(:~
 : dictionary()
 : Constructor for entity dictionary, using the default configuration.
 : The result maps "trie" to the trie built over the match strings and
 : "entities" to an array of the entity items in input order.
 :
 : @param $entities: entity items in the table
 :)
declare function this:dictionary(
  $entities as map(xs:string,xs:string)*
) as map(xs:string,item()*)
{
  map {
    "trie":
      aho:trie(
        $this:DEFAULT-CONFIG,
        for $entity in $entities return this:word($entity)
      ),
    "entities": array { $entities }
  }
};

(:~
 : dictionary()
 : Constructor for entity dictionary. The given configuration is combined
 : with $this:DEFAULT-CONFIG through util:merge-into().
 :
 : @param $entities: entity items in the table
 : @param $config: configuration options
 :)
declare function this:dictionary(
  $entities as map(xs:string,xs:string)*,
  $config as map(xs:string,item()*)
) as map(xs:string,item()*)
{
  map {
    "trie":
      aho:trie(
        util:merge-into($this:DEFAULT-CONFIG, $config),
        for $entity in $entities return this:word($entity)
      ),
    "entities": array { $entities }
  }
};

(:======================================================================:
 : Match and process
 :======================================================================:)

(:~
 : extract()
 : Extract all the entities from the text nodes in the input using the
 : default match options, and return them as handled by the extract function.
 :
 : @param $dictionary: the entity dictionary
 : @param $input: nodes to process
 : @param $extract-function: a function to process each match
 :   The function is given the matching text from the input, the match itself,
 :   and the corresponding entity from the entity table in the dictionary
 :)
declare function this:extract(
  $dictionary as map(xs:string,item()*),
  $input as node()*,
  $extract-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item()),
             (:entity:)map(xs:string,xs:string)) as node()*
) as node()*
{
  this:extract($dictionary, $input, $extract-function, map {})
};

(:~
 : extract()
 : Extract all the entities from the text nodes in the input using the
 : given match options, and return them as handled by the extract function.
 : Only the text-node children of each input node ($input/text()) are scanned.
 :
 : @param $dictionary: the entity dictionary
 : @param $input: nodes to process
 : @param $extract-function: a function to process each match
 :   The function is given the matching text from the input, the match itself,
 :   and the corresponding entity from the entity table in the dictionary
 : @param $match-options: options to use for matching keywords
 :)
declare function this:extract(
  $dictionary as map(xs:string,item()*),
  $input as node()*,
  $extract-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item()),
             (:entity:)map(xs:string,xs:string)) as node()*,
  $match-options as map(xs:string,item()*)?
) as node()*
{
  let $options := util:merge-into($this:DEFAULT-OPTIONS, $match-options)
  for $text in $input/text()
  for $match in $dictionary("trie")=>aho:get-matches($text, $options)
  return (
    $extract-function(
      substring($text, aho:start($match), aho:interval-size($match)),
      $match,
      $dictionary("entities")=>array:get(aho:index($match))
    )
  )
};

(:~
 : markup()
 : Walk the input nodes and return them with all entity matches replaced
 : with the output of the markup function over the given match. Use the
 : default match options.
 :
 : @param $dictionary: the entity dictionary
 : @param $input: nodes to process
 : @param $markup-function: a function to process each match
 :   The function is given the matching text from the input, the match itself,
 :   and the corresponding entity from the entity table in the dictionary
 :)
declare function this:markup(
  $dictionary as map(xs:string,item()*),
  $input as node()*,
  $markup-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item()),
             (:entity:)map(xs:string,xs:string)) as node()*
) as node()*
{
  this:markup($dictionary, $input, $markup-function, map {})
};

(:~
 : markup()
 : Walk the input nodes and return them with all entity matches replaced
 : with the output of the markup function over the given match. Use the
 : given match options.
 :
 : Document and element nodes are rebuilt with their children processed
 : recursively; elements keep their attributes. Text nodes are split around
 : the matches. All other nodes are returned unchanged.
 :
 : @param $dictionary: the entity dictionary
 : @param $input: nodes to process
 : @param $markup-function: a function to process each match
 :   The function is given the matching text from the input, the match itself,
 :   and the corresponding entity from the entity table in the dictionary
 : @param $match-options: options to use for matching keywords
 :)
declare function this:markup(
  $dictionary as map(xs:string,item()*),
  $input as node()*,
  $markup-function as
    function((:text:)xs:string,
             (:match:)map(xs:string,item()),
             (:entity:)map(xs:string,xs:string)) as node()*,
  $match-options as map(xs:string,item()*)?
) as node()*
{
  let $options := util:merge-into($this:DEFAULT-OPTIONS, $match-options)
  for $node in $input
  return
    typeswitch($node)
    case document-node() return
      document {
        this:markup($dictionary, $node/node(), $markup-function, $options)
      }
    case element() return
      element {node-name($node)} {
        (: node() does not select attributes, so copy them explicitly;
         : they must precede all other content of the constructed element.
         :)
        $node/@*,
        this:markup($dictionary, $node/node(), $markup-function, $options)
      }
    case text() return
      let $text := string($node)
      let $matches := $dictionary("trie")=>aho:get-matches($text, $options)
      (: Fold accumulator: the head is the 1-based position of the first
       : character not yet emitted, the tail is the nodes produced so far.
       :)
      let $markup :=
        fold-left(
          $matches,
          (1),
          function ($markup as item()*, $match as map(xs:string,item())*) as item()* {
            let $pos := head($markup)
            let $next := aho:start($match)
            return (
              aho:end($match)+1,
              tail($markup),
              (
                (: Unmatched text between the previous match and this one. :)
                if ($next > $pos) then (
                  text { substring($text, $pos, $next - $pos) }
                ) else (),
                $markup-function(
                  substring($text, aho:start($match), aho:interval-size($match)),
                  $match,
                  $dictionary("entities")=>array:get(aho:index($match))
                )
              )
            )
          }
        )
      return (
        tail($markup),
        (: Whatever follows the last match. :)
        text { substring($text, head($markup)) }
      )
    default return $node
};

(:======================================================================:
 : Some markup functions
 :======================================================================:)

(:~
 : normalize()
 : Replace the matching text with the normalized form of the entity.
 :)
declare %art:markup function this:normalize(
  $matching as xs:string,
  $match as map(xs:string,item()),
  $entity as map(xs:string,xs:string)
) as node()*
{
  text {this:normalized($entity)}
};

(:~
 : remove()
 : Remove the matching text.
 :)
declare %art:markup function this:remove(
  $matching as xs:string,
  $match as map(xs:string,item()),
  $entity as map(xs:string,xs:string)
) as node()*
{
  ()
};

(:~
 : full-entity()
 : Wrap the matching text in entity markup. The entity element
 : will include attributes with the entity id, the normalized form, the
 : start position of the match (useful for extract()), and possibly a subtype.
 : The element name will be derived from the entity type. This function
 : assumes types are in the form type:subtype, e.g. event:surveillance, or
 : just plain type names. A plain type name has no subtype, so no subtype
 : attribute is written for it.
 :)
declare %art:markup function this:full-entity(
  $matching as xs:string,
  $match as map(xs:string,item()),
  $entity as map(xs:string,xs:string)
) as node()*
{
  let $type :=
    if (contains(this:type($entity),":"))
    then substring-before(this:type($entity),":")
    else this:type($entity)
  (: The fallback is the empty sequence, not the type itself; otherwise the
   : exists() test below is always true and plain types get a bogus subtype.
   :)
  let $subtype :=
    if (contains(this:type($entity),":"))
    then substring-after(this:type($entity),":")
    else ()
  return
    element {QName("http://mathling.com/entity", "e:"||this:safe-name($type))} {
      attribute id {this:id($entity)},
      attribute norm {this:normalized($entity)},
      if (exists($subtype)) then attribute subtype {$subtype} else (),
      attribute start {aho:start($match)},
      $matching
    }
};

(:~
 : basic-entity()
 : Wrap the matching text in entity markup. The entity element
 : will include an attribute with the entity id.
 : The element name will be derived from the entity type. This function
 : assumes types are in the form type:subtype, e.g. event:surveillance, or
 : just plain type names.
 :)
declare %art:markup function this:basic-entity(
  $matching as xs:string,
  $match as map(xs:string,item()),
  $entity as map(xs:string,xs:string)
) as node()*
{
  let $type :=
    if (contains(this:type($entity),":"))
    then substring-before(this:type($entity),":")
    else this:type($entity)
  return
    element {QName("http://mathling.com/entity", "e:"||this:safe-name($type))} {
      attribute id {this:id($entity)},
      $matching
    }
};

(:======================================================================:
 : Loading/storing entity dictionaries
 :======================================================================:)

(:~
 : dictionary-from-file:
 : Construct an entity dictionary from a text file consisting of tab-delimited
 : lines. Each line is assumed to have the format:
 :   id\tnormalized form\tmatch string\ttype
 : Returns the dictionary built from the parsed entity items.
 :
 : @param $file: input file
 :)
declare function this:dictionary-from-file(
  $file as xs:string
) as map(xs:string,item()*)
{
  this:dictionary(this:parse-entities(unparsed-text-lines($file)))
};

(:~
 : dictionary-from-file:
 : Construct an entity dictionary from a text file consisting of tab-delimited
 : lines. Each line is assumed to have the format:
 :   id\tnormalized form\tmatch string\ttype
 : Returns the dictionary built from the parsed entity items.
 :
 : @param $file: input file
 : @param $config: configuration options
 :)
declare function this:dictionary-from-file(
  $file as xs:string,
  $config as map(xs:string,item()*)
) as map(xs:string,item()*)
{
  this:dictionary(this:parse-entities(unparsed-text-lines($file)), $config)
};

(:~
 : dictionary-from-buffer:
 : Construct an entity dictionary from a string buffer consisting of
 : tab-delimited lines. Each line is assumed to have the format:
 :   id\tnormalized form\tmatch string\ttype
 : Returns the dictionary built from the parsed entity items.
 :
 : @param $string: input string
 :)
declare function this:dictionary-from-buffer(
  $string as xs:string
) as map(xs:string,item()*)
{
  this:dictionary(this:parse-entities(tokenize($string, "\r\n|\r|\n")[not(.="")]))
};

(:~
 : dictionary-from-buffer:
 : Construct an entity dictionary from a string buffer consisting of
 : tab-delimited lines. Each line is assumed to have the format:
 :   id\tnormalized form\tmatch string\ttype
 : Returns the dictionary built from the parsed entity items.
 :
 : @param $string: input string
 : @param $config: configuration options
 :)
declare function this:dictionary-from-buffer(
  $string as xs:string,
  $config as map(xs:string,item()*)
) as map(xs:string,item()*)
{
  this:dictionary(this:parse-entities(tokenize($string, "\r\n|\r|\n")[not(.="")]), $config)
};

(:~
 : parse-entities()
 : Parse entity items from sequence of tab-delimited lines.
 : Each line is assumed to have the format:
 :   id\tnormalized form\tmatch string\ttype
 : Blank lines and lines starting with "#" are skipped.
 : Returns the sequence of parsed entity items.
 :
 : @param $lines: sequences of lines to parse
 :)
declare %private function this:parse-entities(
  $lines as xs:string*
) as map(xs:string,xs:string)*
{
  for $line in $lines
  (: A blank line has no fields and would fail the entity() type checks;
   : unparsed-text-lines() only drops a trailing empty line.
   :)
  where normalize-space($line) ne "" and not(starts-with($line,"#"))
  return (
    let $parms := tokenize($line, "\t+")[not(.="")]
    return this:entity($parms[1], $parms[2], $parms[3], $parms[4])
  )
};

(: File writer: the EXPath file:write-text function when the processor
 : provides it, otherwise a stub that raises ML-UNAVAILABLE when called.
 :)
declare %private variable $this:SAVE-IMPL as function(xs:string,xs:string) as item()* :=
  let $write-text := function-lookup(QName("http://expath.org/ns/file","write-text"), 2)
  return (
    if (empty($write-text)) then (
      function ($file as xs:string, $contents as xs:string) as empty-sequence() {
        errors:error("ML-UNAVAILABLE", "file:write-text")
      }
    ) else $write-text
  )
;

(:~
 : save-compiled()
 : Save a compiled dictionary to a file. Requires Saxon-PE or Saxon-EE. If
 : you have Saxon-HE set query output method to text and use serialize()
 : directly.
 :
 : @param $to: file to write to
 : @param $dictionary: dictionary to save
 :)
declare function this:save-compiled($to as xs:string, $dictionary as map(xs:string,item()*)) as empty-sequence()
{
  $this:SAVE-IMPL($to, serialize($dictionary, map {"method": "json"}))
};

(:~
 : load-compiled()
 : Load a compiled dictionary. (Save the work of computing the states for the
 : trie.)
 :
 : @param $file: file containing the saved compiled dictionary
 :)
declare function this:load-compiled($file as xs:string)
{
  let $raw := parse-json(unparsed-text($file))
  return
    (: Fix up trie because integers got turned into doubles, alas :)
    $raw=>map:put("trie", aho:fix-trie($raw("trie")))
};

(:======================================================================:
 : Internal
 :======================================================================:)

(: Replace every character that is not a Unicode letter with "_". :)
declare %private function this:safe-name($string as xs:string) as xs:string
{
  replace($string, "[^\p{L}]", "_")
};