Commit 0b2640cb authored by Ulrich's avatar Ulrich

eval

parent ea14b646
......@@ -31,6 +31,9 @@ path = "examples/scoring.rs"
name = "harvesting"
path = "examples/harvesting.rs"
[[bin]]
name = "eval"
path = "examples/eval.rs"
[dependencies]
......
extern crate llamapun;
extern crate libxml;
extern crate senna;
extern crate spotter_lib;
extern crate threadpool;
use std::env;
use spotter_lib::util::*;
use spotter_lib::data::*;
use libxml::tree::*;
use libxml::parser::Parser;
use llamapun::data::Corpus;
use llamapun::data::Document as lDoc;
use llamapun::pattern_example_adaption::get_declarations;
use threadpool::ThreadPool;
use std::sync::mpsc::channel;
use std::time::Duration;
/// Evaluation entry point.
///
/// Usage: `cargo run --bin eval DocumentPath AnnotationPath`
///
/// For every document of the corpus rooted at `DocumentPath`, the matching annotation file is
/// derived by replacing the `DocumentPath` part of the document's path with `AnnotationPath`
/// and the `.html` extension with `.kat.xml`. Both are then handed to `read_document`.
/// Afterwards every document that did not report completion is listed with "Panic at".
pub fn main() {
    let args: Vec<_> = env::args().collect();
    if args.len() <= 2 {
        // The binary is registered under the name "eval" in Cargo.toml.
        println!("usage: cargo run --bin eval DocumentPath AnnotationPath");
        return;
    }
    let config = parse_config("config.txt");
    let document_path = args[1].to_owned();
    let input_path = args[2].to_owned();

    // Documents are currently processed sequentially on this thread; the channel is kept so
    // that the work can be moved onto a thread pool again without touching the reporting below.
    let (tx, rx) = channel();
    let mut mut_corpus = Corpus::new(document_path.clone());
    let mut documents_vec = Vec::new();
    for my_document in mut_corpus.iter() {
        let my_document_path = my_document.path.clone();
        let my_annotation_document = my_document_path
            .replace(&document_path, &input_path)
            .replace(".html", ".kat.xml");
        documents_vec.push(my_document_path.clone());
        read_document(my_document_path.clone(), &my_annotation_document, &config);
        tx.send(my_document_path).unwrap();
    }

    // Collect one completion message per document, giving up on each after the timeout.
    let number_of_documents = documents_vec.len();
    let mut successful_documents = Vec::with_capacity(number_of_documents);
    let timeout = Duration::from_secs(200);
    for i in 0..number_of_documents {
        match rx.recv_timeout(timeout) {
            Ok(succ_doc) => {
                println!("finished ({}/{}) {}", i + 1, number_of_documents, succ_doc);
                successful_documents.push(succ_doc);
            }
            Err(_) => println!("timeout"),
        }
    }

    // Every document that never reported completion is listed.
    for document in documents_vec {
        if !successful_documents.contains(&document) {
            println!("Panic at {}", document);
        }
    }
}
/// Parses the annotation file at `annotation_location`, loads the document at `document_path`
/// and passes the annotation's root element together with both documents to `find_content`.
///
/// Prints "Cannot parse file ..." and returns early when the annotation file cannot be parsed.
/// Panics if the original document cannot be loaded.
pub fn read_document(document_path : String, annotation_location : &str, config : &Config){
    let xml_parser = Parser::default();
    let annotation_document = match xml_parser.parse_file(annotation_location) {
        Ok(parsed) => parsed,
        Err(_) => {
            println!("Cannot parse file {}", annotation_location);
            return;
        }
    };

    let corpus = Corpus::new(document_path.clone());
    let document = corpus.load_doc(document_path).unwrap();

    let mut root = annotation_document.get_root_element();
    find_content(&mut root, &annotation_document, &document, config);
}
/// Walks the annotation tree depth-first looking for `Description` elements.
///
/// For each `Description` element, every `contentmathml` child that has a first child is passed
/// to `parse_cnml` and counted; the count is printed as "Meanings N". The children of a
/// `Description` element are not searched any further. All other nodes are descended into
/// recursively.
///
/// Panics if the type of a node cannot be determined.
fn find_content(node : &mut Node, annotation_document : &Document, original_document : &lDoc, config : &Config) {
    if let NodeType::ElementNode = node.get_type().unwrap() {
        if node.get_name().eq("Description") {
            let mut counter = 0;
            let mut next = node.get_first_child();
            while let Some(child) = next {
                if child.get_name().eq("contentmathml") {
                    if let Some(cnml_child) = child.get_first_child() {
                        // The parsed expression is not used yet; every candidate is counted,
                        // whether or not parsing yields a quantity expression.
                        let _ = parse_cnml(&cnml_child, annotation_document);
                        counter += 1;
                    }
                }
                next = child.get_next_sibling();
            }
            println!("Meanings {}", counter);
            return;
        }
    }

    let mut next = node.get_first_child();
    while let Some(mut child) = next {
        find_content(&mut child, annotation_document, original_document, config);
        next = child.get_next_sibling();
    }
}
/// Builds a `SpottedQE` from a content MathML `apply` node.
///
/// The node is expected to have at least three children: the first one is skipped, the second
/// one is serialized and used as the number, and the third one is handed to `parse_units`.
///
/// Returns `None` if `node` is not an `apply` element or has fewer than three children.
pub fn parse_cnml(node : &Node, document : &Document) -> Option<SpottedQE>{
    if !node.get_name().eq("apply") {
        return None;
    }
    // Malformed input yields `None` instead of a panic, matching the `Option` return type.
    let operator = match node.get_first_child() {
        Some(child) => child,
        None => return None,
    };
    let number_node = match operator.get_next_sibling() {
        Some(child) => child,
        None => return None,
    };
    let unit_node = match number_node.get_next_sibling() {
        Some(child) => child,
        None => return None,
    };

    let (mul, div) = parse_units(&unit_node, document);
    let mut qe = SpottedQE::new(Some(document.node_to_string(&number_node)));
    qe.mul_units = mul;
    qe.div_units = div;
    Some(qe)
}
/// Extracts the units of a content MathML unit expression.
///
/// Returns `(mul_units, div_units)`: the units occurring multiplicatively and the units
/// occurring in a denominator.
///
/// Handled shapes:
/// * `csymbol`: a single unit without prefix and with exponent "1".
/// * `apply` headed by `times`: the units of all factors are concatenated.
/// * `apply` headed by `divide`: the numerator's units are kept as they are and the
///   denominator's units are flipped, so nested fractions are preserved.
/// * `apply` headed by `power`: the first unit of the base with the serialized second operand
///   as exponent.
/// * `apply` headed by a `csymbol` with content "Prefix": a unit built from the prefix operand
///   and the unit operand, with exponent "1".
///
/// Anything else, including an `apply` that lacks the required operands, contributes no units.
pub fn parse_units(node : &Node, document : &Document) -> (Vec<Unit>, Vec<Unit>) {
    let mut mul_units = Vec::new();
    let mut div_units = Vec::new();

    if node.get_name().eq("csymbol") {
        mul_units.push(Unit::new(String::new(), get_all_content(node), "1".to_string()));
        return (mul_units, div_units);
    }
    if !node.get_name().eq("apply") {
        return (mul_units, div_units);
    }

    let operator = match node.get_first_child() {
        Some(child) => child,
        None => return (mul_units, div_units),
    };

    if operator.get_name().eq("times") {
        let mut cursor = operator.get_next_sibling();
        while let Some(factor) = cursor {
            let (mut m, mut d) = parse_units(&factor, document);
            mul_units.append(&mut m);
            div_units.append(&mut d);
            cursor = factor.get_next_sibling();
        }
        return (mul_units, div_units);
    }

    // All remaining operators are binary; without both operands there is nothing to extract.
    let first_operand = operator.get_next_sibling();
    let second_operand = first_operand.as_ref().and_then(|n| n.get_next_sibling());
    if let (Some(lhs), Some(rhs)) = (first_operand, second_operand) {
        if operator.get_name().eq("divide") {
            let (mut num_mul, mut num_div) = parse_units(&lhs, document);
            let (mut den_mul, mut den_div) = parse_units(&rhs, document);
            // (a/b) / (c/d) = (a*d) / (b*c): keep the inner denominators instead of dropping them.
            mul_units.append(&mut num_mul);
            mul_units.append(&mut den_div);
            div_units.append(&mut num_div);
            div_units.append(&mut den_mul);
        } else if operator.get_name().eq("power") {
            let (base_units, _) = parse_units(&lhs, document);
            let exp = document.node_to_string(&rhs);
            // Only the first unit of the base receives the exponent; an empty base is skipped
            // rather than indexed.
            if let Some(base) = base_units.into_iter().next() {
                mul_units.push(Unit::new(
                    base.opt_prefix_sym.unwrap_or_else(String::new),
                    base.unit_sym,
                    exp,
                ));
            }
        } else if operator.get_name().eq("csymbol") && get_all_content(&operator).eq("Prefix") {
            mul_units.push(Unit::new(
                get_all_content(&lhs),
                get_all_content(&rhs),
                "1".to_string(),
            ));
        }
    }

    (mul_units, div_units)
}
In this section, we introduce the necessary technologies for this work. They include the markup language
In this section, we introduce the technologies necessary for the thesis. They include the markup language
MathML (Section \ref{ssec:mathml}), LaTeXML -- the converter from LaTeX to XML -- and the arXMLiv corpus (Section~\ref{ssec:latexml}),
the annotation tool KAT (Section~\ref{ssec:kat}) and the unit converter from Astropy (Section~\ref{ssec:astropy}).
We assume basic knowledge of LaTeX and HTML.
......@@ -18,10 +18,12 @@ We assume basic knowledge of LaTeX and HTML.
The \inline{semantics} child is a container whose first child is usually an
expression in presentation MathML, followed by annotations for this element.
An annotation can be in XML, for instance content MathML, or it can be any
sequence of characters, for example LaTeX code.
sequence of characters, for example LaTeX code.
Annotations can also be omitted. In this case, we can also omit the
\inline{semantics} child and write presentation MathML directly as a
child of the \inline{math} node.
child of the \inline{math} node.
We use angle brackets to denote meta variables in XML like \meta{Presentation MathML},
\meta{Content MathML} and \meta{LaTeX} in this case.
We briefly introduce
presentation and content MathML and point to~\cite{Miner:14:MML} for details.
......
There are various different ways of expressing the same quantity expression, even when it consists
of the same units. Compare, for instance, \quote{5 m}, \quote{5 meter} and \quote{five meter}, which are all equivalent
of the same units. Compare, for instance, \mq{5 m}, \mq{5 meter} and \mq{five meter}, which are all equivalent
ways of stating exactly the same information.
The documents we are working with were originally written in \LaTeX\ which also offers a wide variety of
possibilities to typeset the same expression. Thus the goal of this section is to present concrete examples of
......@@ -19,8 +19,8 @@ end of this section.
We distinguish quantity expressions from a syntactic and a semantic point
of view and for instance have a category containing relatively plain
expressions with units without superscripts (i.e.
\quote{$1\;\rm m$}; syntactically) and a category containing range expressions
for arbitrary units (i.e. \quote{$1 - 2\;\rm kg$}; semantically).
\mq{$1\;\rm m$}; syntactically) and a category containing range expressions
for arbitrary units (i.e. \mq{$1 - 2\;\rm kg$}; semantically).
This intentionally does not lead to a strictly disjoint categorization,
as the examples demonstrate.
For every example, we show the presentational part of the quantity expressions as well as
......@@ -35,35 +35,35 @@ end of this section.
We regard a quantity
expression as simple, if it contains any kind of numeric expression
followed by one or more unit symbols in a multiplicative way (i.e.
\quote{$3\;\text{Nm}$}). This excludes unit symbols, which are in superscript, like
\quote{$30^\circ$}, and we also exclude written-out unit symbols (i.e. meter) and
\mq{$3\;\text{Nm}$}). This excludes unit symbols, which are in superscript, like
\mq{$30^\circ$}, and we also exclude written-out unit symbols (i.e. meter) and
textual numbers for this class. \input{tex/tables/complexcat.tex}
Instead there is a separate category for textual unit symbols and
quantity expressions with textual numbers (Table~\ref{tab:textualcat}).
In addition to the first category, we introduce one for \textit{simple
divisive quantity expressions} (Table~\ref{tab:simpdivcat}), which differs
only by the fact that units may also occur in a divisive way (i.e.
\quote{$4\;\text{m/s}$}). \input{tex/tables/superscriptcat.tex}
\mq{$4\;\text{m/s}$}). \input{tex/tables/superscriptcat.tex}
We extend the simple multiplicative and divisive quantity expressions
to the category for \textit{complex quantity expressions}
(Table~\ref{tab:complexcat}) which contains expressions from the former
categories, but allows additional superscripts for units (i.e.
\quote{$5\;\text{m}^2$} or \quote{$5\;\text{m/s}^2$}). This category subsumes the first two.
\mq{$5\;\text{m}^2$} or \mq{$5\;\text{m/s}^2$}). This category subsumes the first two.
\input{tex/tables/textualcat.tex}
Additionally, we have a category for quantity expressions where unit symbols
are part of the superscript -- as in \quote{$30^\circ$}. Although written in their own
are part of the superscript -- as in \mq{$30^\circ$}. Although written in their own
manner, these expressions are, from a semantic point of view,
closely related to the simple multiplicative units. \input{tex/tables/rangecat.tex}
Furthermore, there are extra classes for \textit{range expressions}
(Table~\ref{tab:rangecat}) and \textit{unit products}
(Table~\ref{tab:unitprodcat}) because they include the previous expression, but contain additional semantic
information. \input{tex/tables/unitprodcat.tex}
For instance \quote{$23\;\mu\text{m} \times 23\;\mu\text{m}$}
describes not only an area of size \quote{$529\;\mu\text{m}^2$}, but also contains the information
that the area is quadratic and has an edge length of \quote{$23\;\mu\text{m}$}.
For instance \mq{$23\;\mu\text{m} \times 23\;\mu\text{m}$}
describes not only an area of size \mq{$529\;\mu\text{m}^2$}, but also contains the information
that the area is square and has an edge length of \mq{$23\;\mu\text{m}$}.
Additional information is necessary to handle quantity expressions involving constants (Table~\ref{tab:constantcat}).
For instance, we need to know that \memph{$\Omega_{a}$ is the ratio of the axion energy density to the
critical density in the Universe}~\cite{hep-ph/9807232} and that \quote{$h$} is the Hubble constant to understand
critical density in the Universe}~\cite{hep-ph/9807232} and that \mq{$h$} is the Hubble constant to understand
the meaning of Example 2 of
this table. \input{tex/tables/constantcat.tex}
The examples in Table~\ref{tab:onlyunitcat} do not describe quantity expressions, but
......@@ -118,30 +118,36 @@ end of this section.
At first, we look at the presentational part and then at the corresponding LaTeX source.
From the rendered result, we observe that unit strings can be ambiguous.
For instance \quote{GHz} is very likely to stand for \quote{Gigahertz}, but could also denote
\quote{Gauß\footnote{Gauß is a unit of magnetic induction, commonly
abbreviated as \quote{G}.}~$\cdot$~Hertz}. Similarly, \quote{Pa} has two possible meanings --
\quote{Petayear} and \quote{Pascal\footnote{Pascal is a unit of pressure, commonly abbreviated as \quote{Pa}.}}.
For instance \mq{GHz} is very likely to stand for \mq{Gigahertz}, but could also denote
\mq{Gauß\footnote{Gauß is a unit of magnetic induction, commonly
abbreviated as \mq{G}.}~$\cdot$~Hertz}. Similarly, \mq{Pa} has two possible meanings --
\mq{Petayear} and \mq{Pascal\footnote{Pascal is a unit of pressure, commonly abbreviated as \mq{Pa}.}}.
Additionally, it is not always clear which part of the expression an exponent refers to.
In Example 4 from Table~\ref{tab:complexcat}, \quote{$0.5$ eV/Å${}^{3}$}, the exponent might refer
In Example 4 from Table~\ref{tab:complexcat}, \mq{$0.5$ eV/Å${}^{3}$}, the exponent might refer
only to Ångström\footnote{Ångström is a unit of length ($1 \; \rm \text{\AA} = 10^{-10} \; m$).},
but also to the whole expression. Hence we have the two possible meanings
\quote{$0.5 \; \text{eV/(Å}^3\text{)}$} and \quote{$0.5 \; \text{(eV/Å)}^3$}. Given the context of the paper,
\mq{$0.5 \; \text{eV/(Å}^3\text{)}$} and \mq{$0.5 \; \text{(eV/Å)}^3$}. Given the context of the paper,
we see that the former was meant in this case.
In the rendered result, the changes between text and math mode do not complicate the understanding of the
expressions for humans.
But when looking at the LaTeX source, we observe that this adds a lot of noise to the data.
People tend to misuse text mode to ensure an upright font for units, instead of using the
\verb|\rm| command. This leads to somewhat surprising encodings like \\
``\verb|$100$| km s\verb|${}^{-1}$| Mpc \verb|${}^{-1}$|'', \\
``10\verb|$^{-4}$|M\verb|$_\odot$|'' and especially \\
``3.72 \verb|$\times$| 10 \verb|${}^{10}$| cm \verb|${}^{-2}$|''. \\
\verb|\rm| command. This leads to somewhat surprising encodings like
\begin{quote}
``\verb|$100$| km s\verb|${}^{-1}$| Mpc \verb|${}^{-1}$|'',
\end{quote}
\begin{quote}
``10\verb|$^{-4}$|M\verb|$_\odot$|''
\end{quote} and especially
\begin{quote}
``3.72 \verb|$\times$| 10 \verb|${}^{10}$| cm \verb|${}^{-2}$|''.
\end{quote}
Example 1 from Table~\ref{tab:complexcat} is also a remarkable case, because here,
we see a difference between the semantics of the LaTeX
source and the rendered result.
From its source, we can assume the meaning \quote{$\rm (W/cm)^2$}, while the presentational part
seems more consistent with \quote{$\rm W/(cm^2)$}. The latter is correct here, because the expression describes
From its source, we can assume the meaning \mq{$\rm (W/cm)^2$}, while the presentational part
seems more consistent with \mq{$\rm W/(cm^2)$}. The latter is correct here, because the expression describes
the intensity of a laser beam.
......
We now present the architecture of the implementation in Figure~\ref{fig:architecture} and
consists of three disjoint parts, where the first one is a small but important preprocessing step.
Quantity expressions are spotted as a second step. The results are stored
as separate annotations which contain references to the documents.
Rating the likelihood of the annotations is a subtask of this step. This is especially relevant, when there is more than one annotation -- this means more than one possible meaning -- for an expression. We will refer to a program as \textit{spotter}, if it creates annotations from the document. In addition to that, we will call a program
\textit{scorer}, when it is working on existing annotations and manipulates their value of likelihood.
The architecture permits multiple spotting and scoring routines and can easily be extended additional ones.
Semantic services are based on the documents and their annotations and form the
last part.
The structure of this section follows the structure of the architecture. We thus first describe the preprocessing (Section~\ref{ssec:tokenization}), then the spotting and scoring of quantity expressions including the output format (Section~\ref{ssec:spotting}) and at the end the semantic services (Section~\ref{ssec:implunitconv} to~\ref{ssec:implharvest}).
\begin{figure}
\begin{figure}[ht]
\begin{tikzpicture}[
>=stealth,
node distance=3cm,
......@@ -55,6 +41,22 @@ The structure of this section follows the structure of the architecture. We thus
\label{fig:architecture}
\end{figure}
We present the architecture of our implementation in Figure~\ref{fig:architecture}.
It consists of three disjoint parts, where the first one is a small but important preprocessing step.
Quantity expressions are spotted as a second step. The results are stored
as separate annotations which contain references to the documents.
Rating the likelihood of the annotations is a subtask of this step. This is especially relevant, when there is more than one annotation -- this means more than one possible meaning -- for an expression. We will refer to a program as \textit{spotter}, if it creates annotations from the document. In addition to that, we will call a program
\textit{scorer}, when it is working on existing annotations and manipulates their value of likelihood.
The architecture permits multiple spotting and scoring routines and can easily be extended by additional ones.
Semantic services are based on the documents and their annotations and form the
last part.
The structure of this section follows the structure of the architecture. We thus first describe the preprocessing (Section~\ref{ssec:tokenization}), then the spotting and scoring of quantity expressions including the output format (Section~\ref{ssec:spotting}) and at the end the semantic services (Section~\ref{ssec:implunitconv} to~\ref{ssec:implharvest}).
\subsection{Text Tokenization}
\label{ssec:tokenization}
......@@ -326,10 +328,10 @@ at the end:
\end{itemize}
For instance, for the input [(GHz, 1)], there are two possible meanings
found; one is Gigahertz and the other one is Gaus $\cdot$
found; one is Gigahertz and the other one is Gauß $\cdot$
Hertz. This is due to Rule 1, which says that after a unit symbol (Hz)
was found, the parsing continues to search for both a prefix (Giga) and
a unit symbol (Gaus). In this case, both attempts are successful. Of
a unit symbol (Gauß). In this case, both attempts are successful. Of
course, Gigahertz is much more likely to be meant here, but without any
further knowledge, the other option cannot be ruled out completely.
......
......@@ -62,7 +62,8 @@
}
\def\inline{\lstinline[language=XML]}
\def\memph#1{``\textit{#1}''}
\def\quote#1{``#1''}
\def\mq#1{``#1''}
\def\meta#1{\hspace{1mm}\ensuremath{\langle\langle\hspace{1mm}\text{#1}\hspace{1mm}\rangle\rangle}\hspace{1mm}}
\title{Meaning Extraction and Semantic Services in STEM-Documents}
......
<math xmlns="http://www.w3.org/1998/Math/MathML">
<semantics>
<mrow>Presentation MathML</mrow>
<annotation-xml>Content MathML</annotation-xml>
<mrow>(*\meta{Presentation MathML}*)</mrow>
<annotation-xml>(*\meta{Content MathML}*)</annotation-xml>
<annotation encoding="application/x-tex">
LaTeX
(*\meta{LaTeX}*)
</annotation>
</semantics>
</math>
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment