(: Copyright 2009 Cantus Foundation http://alpheios.net This file is part of Alpheios. Alpheios is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Alpheios is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . :) module namespace almt="http://alpheios.net/namespaces/alignment-match"; (: Find matches in two strings of words The output is a sequence of elements with the following attributes: type - "1-to-1" for a run of identical words in both sets of words "skip" for a run of words that appears in only one set of words "mismatch" for runs of non-matching words in both sets of words o1 - offset of start of run in first set of words o2 - offset of start of run in second set of words l1 - length of run in first set of words l2 - length of run in second set of words w1 - first word in run in first set of words w2 - first word in run in second set of words w - first word in run (type "1-to-1" only) If the type is "skip" then only one of {o1,l1,w1} and {o2,l2,w2} will appear, depending on which set of words the skipped text belongs to. If the type is "1-to-1" then w appears and w1/w2 are absent. Several static variables parametrize the algorithm: s_maxMissing max number of missing words to test for s_maxMismatch max number of mismatched words to test for s_match number of words to verify match The s_sync sequence holds the offsets to test for in searching for a synchronization point in the two sets of words. Attributes x and y are the offsets to sets in the first and second set, respectively. The sequence tests first for missing words of increasing length (0, 1), (1, 0), (0, 2), (2, 0), ..., (0, s_maxMissing), (s_maxMissing, 0) Then the sequence tests for mismatched texts. The test points are ordered by the increasing sum of x and y, where the difference between x and y is no more than s_maxMismatch. (1, 1), (0, 2), (1, 1), (2, 0), ..., (s_maxMismatch/2, 3*s_maxMismatch/2) s_match words must match after either a skip or mismatch in order to establish a new match. :) declare variable $almt:s_maxMissing := 100; declare variable $almt:s_maxMismatch := 10; declare variable $almt:s_match := 3; declare variable $almt:s_sync := ( (: test points for missing text :) for $i in (1 to $almt:s_maxMissing) return ( element pair { attribute x { 0 }, attribute y { $i } }, element pair { attribute x { $i }, attribute y { 0 } } ), (: test points for mismatched text :) almt:gen-sync-list(1, 1, $almt:s_maxMismatch) ); (: Recursive function to generate synchronization test points Parameters: $a_x test offset for first list $a_y test offset for second list $a_max maximum difference between offsets Return value: sequence of elements with attributes: x = offset for first list y = offset for second list :) declare function almt:gen-sync-list( $a_x as xs:integer, $a_y as xs:integer, $a_max as xs:integer) as element()* { ( (: put out current pair :) element pair { attribute x { $a_x }, attribute y { $a_y } }, (: if need to start new line :) if (($a_x = 1) or (($a_y - $a_x + 2) > $a_max)) then (: if at end of sequence :) if (($a_x + $a_y) = (2 * $a_max)) then () else let $newsum := $a_x + $a_y + 1 return if ($newsum > $a_max) then let $x := ($newsum + $a_max) idiv 2 return almt:gen-sync-list($x, $newsum - $x, $a_max) else almt:gen-sync-list($newsum - 1, 1, $a_max) else almt:gen-sync-list($a_x - 1, $a_y + 1, $a_max) ) }; (: Function to generate word sequence matches Parameters: $a_w1 first list of words $a_w2 second list of words $a_ignore whether to do case-insensitive compares Return value: sequence of elements with attributes: w = starting word w1 = starting word in w1 list w2 = starting word in w2 list o1 = starting offset in w1 list o2 = starting offset in w2 list l1 = length of l1 matching sequence l2 = length of l2 matching sequence :) declare function almt:match( $a_w1 as xs:string*, $a_w2 as xs:string*, $a_ignore as xs:boolean) as element()* { almt:do-match($a_w1, $a_w2, $a_ignore, 1, 1) }; (: Recursive function to generate word sequence matches Parameters: $a_w1 first list of words $a_w2 second list of words $a_ignore whether to do case-insensitive compares $a_o1 offset in first list $a_o2 offset in second list Return value: sequence of elements :) declare function almt:do-match( $a_w1 as xs:string*, $a_w2 as xs:string*, $a_ignore as xs:boolean, $a_o1 as xs:integer, $a_o2 as xs:integer) as element()* { (: if both lists are empty, nothing left to do :) if (($a_o1 > count($a_w1)) and ($a_o2) > count($a_w2)) then () else (: find last match :) let $len := min((count($a_w1) - $a_o1, count($a_w2) - $a_o2)) let $firstMisMatch := ( for $i in (0 to $len) let $differ := if ($a_ignore) then if (lower-case($a_w1[$a_o1 + $i]) = lower-case($a_w2[$a_o2 + $i])) then false() else true() else if ($a_w1[$a_o1 + $i] = $a_w2[$a_o2 + $i]) then false() else true() where $differ return $i )[1] let $lastMatch := if (exists($firstMisMatch)) then $firstMisMatch else $len + 1 return ( (: initial matching sequence was found :) if ($lastMatch > 0) then (: include matched words :) element match { attribute type { "1-to-1" }, attribute o1 { $a_o1 }, attribute o2 { $a_o2 }, attribute l1 { $lastMatch }, attribute l2 { $lastMatch }, attribute w { $a_w1[$a_o1] } } else (), (: skip match and find next sync point :) let $sync := almt:sync($a_w1, $a_w2, $a_ignore, $a_o1 + $lastMatch, $a_o2 + $lastMatch, 1) return if ($sync) then ( (: return words up to sync point :) element match { attribute type { if (($sync/@l1 = 0) or ($sync/@l2 = 0)) then "skip" else "mismatch" }, if ($sync/@l1 > 0) then ( attribute o1 { $a_o1 + $lastMatch }, attribute l1 { $sync/@l1 }, attribute w1 { $a_w1[$a_o1 + $lastMatch] } ) else (), if ($sync/@l2 > 0) then ( attribute o2 { $a_o2 + $lastMatch }, attribute l2 { $sync/@l2 }, attribute w2 { $a_w2[$a_o2 + $lastMatch] } ) else () }, (: continue matching from sync point :) almt:do-match($a_w1, $a_w2, $a_ignore, xs:integer($a_o1 + $lastMatch + $sync/@l1), xs:integer($a_o2 + $lastMatch + $sync/@l2)) ) else let $o1 := $a_o1 + $lastMatch let $o2 := $a_o2 + $lastMatch return if (($o1 <= count($a_w1)) or ($o2 <= count($a_w2))) then element oops { attribute w1 { $a_w1[$o1] }, attribute w2 { $a_w2[$o2] }, attribute o1 { $o1 }, attribute o2 { $o2 } } else () ) }; (: Recursive function to find synchronization point Parameters: $a_w1 first list of words $a_w2 second list of words $a_ignore whether to do case-insensitive compares $a_o1 offset in first list $a_o2 offset in second list $a_isync index into synchronization test point sequence Return value: element with attributes: l1 = length of l1 sequence l2 = length of l2 sequence () if empty input or no synchronization point is found :) declare function almt:sync( $a_w1 as xs:string*, $a_w2 as xs:string*, $a_ignore as xs:boolean, $a_o1 as xs:integer, $a_o2 as xs:integer, $a_isync as xs:integer) as element()? { (: if first list is empty :) if ($a_o1 > count($a_w1)) then if ($a_o2 <= count($a_w2)) then element sync { attribute l1 { 0 }, attribute l2 { count($a_w2) - $a_o2 + 1 } } else () else if ($a_o2 > count($a_w2)) then element sync { attribute l1 { count($a_w1) - $a_o1 + 1 }, attribute l2 { 0 } } (: if no more sync test points, we failed :) else if ($a_isync > count($almt:s_sync)) then () else let $so1 := $almt:s_sync[$a_isync]/@x let $so2 := $almt:s_sync[$a_isync]/@y (: count matches :) let $nmatches := sum(for $i in (0 to $almt:s_match - 1) return if ($a_ignore) then if (lower-case($a_w1[$a_o1 + $so1 + $i]) = lower-case($a_w2[$a_o2 + $so2 + $i])) then 1 else 0 else if ($a_w1[$a_o1 + $so1 + $i] = $a_w2[$a_o2 + $so2 + $i]) then 1 else 0 ) return (: if all words match :) if ($nmatches = $almt:s_match) then (: return result :) element sync { attribute l1 { $so1 }, attribute l2 { $so2 } } else (: try next sync point :) almt:sync($a_w1, $a_w2, $a_ignore, $a_o1, $a_o2, $a_isync + 1) };