(:
Copyright 2012 The Alpheios Project, Ltd.
http://alpheios.net
This file is part of Alpheios.
Alpheios is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Alpheios is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
:)
(:
Mark words in text with elements
:)
import module namespace almt="http://alpheios.net/namespaces/alignment-match"
at "alignment-match.xquery";
declare namespace tei="http://www.tei-c.org/ns/1.0";
declare namespace align="http://alpheios.net/namespaces/aligned-text";
declare variable $e_work as xs:string external;
declare variable $e_edition as xs:string external;
declare variable $e_translation as xs:string external;
declare variable $e_transLang as xs:string external;
declare variable $e_srcLang as xs:string external;
declare variable $e_reverse as xs:boolean external;
declare variable $e_path as xs:string external;
declare variable $e_includePunc as xs:boolean external;
declare variable $e_docname as xs:string :=
if ($e_reverse) then concat($e_work,'.',$e_translation,'.xml') else concat($e_work,'.',$e_edition,'.xml');
declare variable $e_align as xs:string := concat($e_work,'.',$e_edition,'.',$e_translation,'.align.xml');
declare variable $e_treebank as xs:string := concat($e_work,'.',$e_edition,'.tb.xml');
declare variable $e_transDoc as xs:string :=
if ($e_reverse) then concat($e_work,'.',$e_edition,'.xml') else concat($e_work,'.',$e_translation,'.xml');
declare variable $e_lang as xs:string :=
if ($e_reverse) then $e_transLang else $e_srcLang;
(: sets of characters by language :)
(: non-text characters :)
declare variable $s_nontext :=
(
element nontext
{
attribute lang { "grc" },
" “”—"‘’,.:;··?!\[\]{}\-"
},
element nontext
{
attribute lang { "ara" },
" “”—"‘’,.:;?!\[\]{}\-،؍"
},
element nontext
{
attribute lang { "*" },
" “”—"‘’,.:;··?!\[\](){}\-"
}
);
(: characters which signify word break and are part of word :)
declare variable $s_breaktext :=
(
element breaktext
{
attribute lang { "grc" },
"᾽"
},
element breaktext
{
attribute lang { "ara" },
"᾽"
},
element breaktext
{
attribute lang { "*" },
"᾽"
}
);
declare variable $s_linenum :=
(
element breaktext
{
attribute lang { "*" },
"1234567890"
}
);
declare variable $s_tbPunc :=
(
element punc
{
attribute lang { "*" },
",.:;\-—"
}
);
(:
Process set of nodes
:)
declare function local:process-nodes(
$a_nodes as node()*,
$a_in-text as xs:boolean,
$a_id as xs:string,
$a_match-text as xs:string,
$a_match-nontext as xs:string,
$a_match-linenum as xs:string,
$a_match-punc as xs:string) as node()*
{
(: for each node :)
for $node at $i in $a_nodes
return
typeswitch ($node)
(:
if element, copy and process all child nodes
:)
case element()
return
element { QName(namespace-uri($node),name($node)) }
{
local:process-nodes(
$node/(node()|@*),
($a_in-text or (local-name($node) eq "body"))
and not(local-name($node) = ("note", "head")),
concat($a_id, "-", $i),
$a_match-text,
$a_match-nontext,
$a_match-linenum,
$a_match-punc)
}
(: if text in body, process it else just copy it :)
case $t as text()
return
if ($a_in-text)
then
local:process-text(normalize-space($t),
concat($a_id, "-", $i),
1,
$a_match-text,
$a_match-nontext,
$a_match-linenum,
$a_match-punc)
else
$node
(: otherwise, just copy it :)
default
return $node
};
(:
Process text string in body
:)
declare function local:process-text(
$a_text as xs:string,
$a_id as xs:string,
$a_i as xs:integer,
$a_match-text as xs:string,
$a_match-nontext as xs:string,
$a_match-linenum as xs:string,
$a_match-punc as xs:string) as node()*
{
(: if anything to process :)
if (string-length($a_text) > 0)
then
(: see if it starts with text :)
let $is-text := matches($a_text, $a_match-text)
(: see if it starts with text :)
let $is-punc := matches($a_text, $a_match-punc)
(: see if its a line number:)
let $is-linenum := matches($a_text, $a_match-linenum)
(: get initial text/non-text string :)
let $t := replace($a_text,
if ($is-text) then $a_match-text else if ($is-punc) then $a_match-punc else $a_match-nontext,
"$1")
return
(
(: return w element with text or non-text string :)
if ($is-text)
then
element {QName("http://www.tei-c.org/ns/1.0","w")}
{
(: assign unique id to word :)
attribute xml:id { concat($a_id, "-", $a_i) },
$t
}
else if ($is-linenum)
then
element {QName("http://www.tei-c.org/ns/1.0","milestone")}
{
attribute unit { "Line"},
attribute n { $t }
}
else if ($is-punc and ($e_includePunc = true()))
then
element {QName("http://www.tei-c.org/ns/1.0","w")}
{
(: assign unique id to word :)
attribute xml:id { concat($a_id, "-", $a_i) },
$t
}
else
text { $t },
(: then recursively process rest of text :)
local:process-text(substring-after($a_text, $t),
$a_id,
$a_i + 1,
$a_match-text,
$a_match-nontext,
$a_match-linenum,
$a_match-punc)
)
else ()
};
(:
Print a set of aligned nodes
Parameters:
$a_nodes set of nodes to process
$a_fixedWords set of adjusted w nodes
Return value:
sequence of adjusted nodes
Each in the original text has a unique id attribute.
Each fixed is wrapped in a element with the
matching id. The fixed 's do not have id attributes,
since these are needed only for the purpose of efficiently
matching original with fixed elements.
:)
declare function local:print-nodes(
$a_nodes as node()*,
$a_fixedWords as element(wrap)*) as node()*
{
(: for each node :)
for $node in $a_nodes
return
typeswitch ($node)
(: if w, replace with corresponding fixed word, if it exists :)
case $word as element(tei:w)
return
if ($a_fixedWords[@xml:id = $word/@xml:id]/tei:w)
then
let $fixed := $a_fixedWords[@xml:id = $word/@xml:id]/tei:w
return
element {QName("http://www.tei-c.org/ns/1.0","w")} {
$fixed/@*[name(.) != 'nrefs' and name(.) != 'tbrefs' and local-name(.) != 'id'],
if ($fixed/@nrefs) then
attribute corresp {
if (contains($fixed/@nrefs, ' ')) then
string-join (
for $t in tokenize(normalize-space($fixed/@nrefs),' ')
return concat($e_transDoc,"#xpointer(//*[@n='", $t, "'])"), ' ')
else
concat($e_transDoc,"#xpointer(//*[@n='", $fixed/@nrefs, "'])")
} else (),
if ($fixed/@tbrefs) then
attribute ana {
string($fixed/@tbrefs)
(:
if (contains($fixed/@tbrefs, ' ')) then
string-join (
for $t in tokenize(normalize-space($fixed/@tbrefs),' ')
let $parts := tokenize($t,'-')
return concat($e_treebank,"#xpointer(//sentence[@id='", $parts[1],"']/word[@id='", $parts[2], "'])"), ' ')
else
let $parts := tokenize($fixed/@tbrefs,'-')
return concat($e_treebank,"#xpointer(//sentence[@id='", $parts[1],"']/word[@id='", $parts[2], "'])")
:)
}
else (),
$fixed/*,
$fixed/text()
}
else $node
(:
if element, copy and process all child nodes
:)
case element()
return
element { QName(namespace-uri($node),name($node)) }
{
local:print-nodes(
$node/(node()|@*),
$a_fixedWords)
}
(: otherwise, just copy it :)
default
return $node
};
(:
Fix a set of text words
Parameters:
$a_textWords set of text words
$a_dataWords set of data words (either alignment or treebank)
$a_matches match info on sets of words
$a_treebank whether data is treebank or alignment
Return value:
sequence of text words with appropriate attributes added from data
:)
declare function local:fix-words(
$a_textWords as element(wrap)*,
$a_dataWords as element(tei:w)*,
$a_matches as element()*,
$a_treebank as xs:boolean) as element(wrap)*
{
(: nothing to do if no text words left :)
if (count($a_textWords) eq 0)
then ()
else
(: if no matches left, copy text words :)
if (count($a_matches) eq 0)
then $a_textWords
else
if ($a_matches/*:oops)
then $a_textWords
else
(: create words for this match :)
let $match := $a_matches[1]
let $newWords :=
(: if 1-to-1 match :)
if ($match/@type eq "1-to-1")
then
for $i in (1 to $match/@l1)
return
local:fix-word($a_textWords[$i],
$a_dataWords[$i]/@n,
$a_dataWords[$i]/@nrefs,
$a_treebank)
(: if mismatch :)
else if ($match/@type eq "mismatch")
then
let $n :=
string-join(
for $j in (1 to $match/@l2)
return $a_dataWords[$j]/@n,
' ')
let $nrefs :=
string-join(
for $j in (1 to $match/@l2)
return $a_dataWords[$j]/@nrefs,
' ')
for $i in (1 to $a_matches[1]/@l1)
return
local:fix-word($a_textWords[$i], $n, $nrefs, $a_treebank)
(: if skip :)
else
subsequence($a_textWords, 1, if ($match/@l1) then $match/@l1 else 0)
return
(
$newWords,
local:fix-words(
subsequence($a_textWords, if ($match/@l1) then $match/@l1 + 1 else 1),
subsequence($a_dataWords, if ($match/@l2) then $match/@l2 + 1 else 1),
subsequence($a_matches, 2),
$a_treebank)
)
};
(:
Fix a single text word
Parameters:
$a_textWord text word to fix
$a_dataWord data word (either alignment or treebank)
$a_treebank whether data is treebank or alignment
Return value:
text word with appropriate attributes added from data
:)
declare function local:fix-word(
$a_textWord as element(wrap)?,
$a_n as xs:string?,
$a_nrefs as xs:string?,
$a_treebank as xs:boolean) as element(wrap)?
{
(: if no textword, nothing to do :)
if (not($a_textWord)) then () else
(: if no data, just copy textword :)
if (not($a_n)) then $a_textWord else
(: wrapper to hold id :)
element wrap
{
(: preserve original word id :)
$a_textWord/@xml:id,
(: new w element :)
element {QName("http://www.tei-c.org/ns/1.0","w")}
{
(: preserve any existing attributes :)
$a_textWord/tei:w/@*,
(: if this is treebank data :)
if ($a_treebank)
then
(: create tb ref attribute :)
attribute tbrefs { $a_n }
else
(
(: copy word id and alignment refs :)
attribute n { $a_n },
if (exists($a_nrefs)) then attribute nrefs {$a_nrefs } else ()
),
(: content is original word :)
$a_textWord/*:w/text()
}
}
};
let $doc := doc(concat($e_path,$e_docname))
let $nontext :=
if ($s_nontext[@lang eq $e_lang])
then
$s_nontext[@lang eq $e_lang]/text()
else
$s_nontext[@lang eq "*"]/text()
let $breaktext :=
if ($s_breaktext[@lang eq $e_lang])
then
$s_breaktext[@lang eq $e_lang]/text()
else
$s_breaktext[@lang eq "*"]/text()
let $linenum :=
$s_linenum[@lang eq "*"]/text()
let $punc :=
$s_tbPunc[@lang eq "*"]/text()
let $match-text :=
concat("^([^", $nontext, $breaktext, $linenum, "]+",
if ($breaktext) then concat("[", $breaktext, "]?") else (),
").*")
let $match-nontext := concat("^([", $nontext, $linenum, "]+).*")
let $match-linenum := concat("^([", $linenum, "]+).*")
let $match-punc := concat("^([", $punc, "]+).*")
let $marked-words :=
local:process-nodes($doc/node(), false(), "w1", $match-text, $match-nontext, $match-linenum, $match-punc)
(: END MARK WORDS :)
let $alignDoc := concat($e_path,'/',$e_align)
let $tbDoc := concat($e_path,'/',$e_treebank)
(: get words from original text :)
return
if (doc-available($alignDoc) or doc-available($tbDoc))
then
(: speakers aren't treebanked so ignore words inside speaker tags :)
let $textWords :=
for $word in
$marked-words//tei:w[not(parent::tei:speaker) and not(parent::tei:label) and not(ancestor::tei:docAuthor) ]
return
element wrap
{
$word/@xml:id,
element {QName("http://www.tei-c.org/ns/1.0","w")} {
if ($word/@nrefs or $word/@tbrefs)
then ($word/@nrefs,$word/@tbrefs)
else $word/@*,
$word/text()
}
}
(: get words from aligned text, ignoring non-text :)
let $alignLnum :=
if (doc-available($alignDoc))
then doc($alignDoc)//align:language[@xml:lang = $e_lang]/@lnum
else ()
let $alignWords :=
if (doc-available($alignDoc))
then
for $word in doc($alignDoc)//align:wds[@lnum=$alignLnum]/align:w
where not(matches($word/*:text, $match-nontext)) or ($e_includePunc = true() and matches($word/*:text,$match-punc))
return
element {QName("http://www.tei-c.org/ns/1.0","w")}
{
$word/@n,
$word/align:refs/@nrefs,
$word/align:text/text()
}
else ()
(: get words from treebank, ignoring non-text :)
let $tbWords :=
if (doc-available($tbDoc))
then
for $word in doc($tbDoc)//*:word
where not(matches($word/@form, $match-nontext)) or ($e_includePunc = true() and matches($word/@form,$match-punc))
return
element {QName("http://www.tei-c.org/ns/1.0","w")}
{
(: if sentence is valid :)
if (exists($word/../@id))
then
(: build name from sentence# and word# :)
attribute n
{
(: concat($word/../@id, "-", $word/@id) :)
$word/@relation
}
else (),
data($word/@form)
}
else ()
(: create fixed words to replace original :)
let $fix1 :=
local:fix-words($textWords,
$alignWords,
almt:match(data($textWords/tei:w), data($alignWords), true()),
false())
let $fix2 :=
if (doc-available($tbDoc))
then
local:fix-words(if ($fix1) then $fix1 else $textWords,
$tbWords,
almt:match(data($textWords/tei:w), data($tbWords), true()),
true())
else $fix1
return
(: create copy of original text with fixed words :)
element {QName("http://www.tei-c.org/ns/1.0","TEI")} {
$doc/tei:TEI/@*,
local:print-nodes($marked-words/node(), $fix2)
}
else $marked-words