Commit 2d2e9241 authored by Ulrich's avatar Ulrich

strange error in scoring

parent a3b52674
[package]
name = "spotter"
version = "0.0.1"
authors = ["jfschaefer <j.schaefer@jacobs-university.de>, urabenstein <ulrich.rabenstein@fau.de>"]
authors = ["urabenstein <ulrich.rabenstein@fau.de>"]
[lib]
name = "spotter_lib"
......
......@@ -54,7 +54,7 @@ pub fn search_dir(dir_path : &Path, original_path : &str, output_path : &str, co
println!("Scoring {}", name);
let filename = string.split("/").last().unwrap();
let filename_without_suffix = filename.replace(".kat.xml","");
println!("File {}", filename_without_suffix);
// println!("File {}", filename_without_suffix);
let harvest = read_document(&string, config, &filename_without_suffix);
let dir = &relative_path[..(relative_path.len()-filename.len())];
......@@ -64,9 +64,9 @@ pub fn search_dir(dir_path : &Path, original_path : &str, output_path : &str, co
output = output.replace(".kat.xml",".harvest");
println!("output localtion {}", output);
println!("relativ path {}", relative_path);
println!("dir {}", output_dir);
// println!("output localtion {}", output);
// println!("relativ path {}", relative_path);
// println!("dir {}", output_dir);
let r = std::fs::create_dir_all(&output_dir);
if r.is_err(){
......
......@@ -42,21 +42,29 @@ pub fn main() {
let pool = ThreadPool::new(CORES);
let (tx, rx) = channel();
let number_of_documents = doc_vec.len();
let mut i = 1;
for my_annotation_path in doc_vec.clone(){
let my_tx = tx.clone();
let my_config = config.clone();
let my_document_path = document_path.clone();
let my_input_path = input_path.clone();
pool.execute(move || {
// pool.execute(move || {
read_document(&my_annotation_path, Path::new(&my_input_path), Path::new(&my_document_path), &my_config);
my_tx.send(my_annotation_path).unwrap();
});
println!("finished ({}/{}) {}", i, number_of_documents, document_path.clone());
i = i +1;
// });
}
let mut successful_documents = vec![];
let number_of_documents = doc_vec.len();
/*
println!("number of documentss {}", number_of_documents);
let mut successful_documents = vec![];
let timeout = Duration::from_secs(200);
for i in 0..number_of_documents{
......@@ -69,6 +77,9 @@ pub fn main() {
println!("timeout");
}
}
*/
println!("done");
let mut doc_panic = vec![];
......@@ -177,7 +188,7 @@ pub fn read_document(s : &str, old_path : &Path, document_path : &Path, config :
println!("find content");
find_content(&mut root, &doc, &orig_doc, config, &declaration_symbols);
doc.save_file(&s).unwrap();
}
......@@ -199,15 +210,18 @@ fn find_content(node : &mut Node, annotation_document : &Document, original_docu
let mut c_unw = c.unwrap();
if c_unw.get_name().eq("contentmathml"){
println!("contentmathml");
let cc = c_unw.get_first_child();
//attach an old score, if there is one
if opt_score.is_some(){
println!("opt_score");
c_unw.remove_property("score");
c_unw.add_property("score", opt_score.unwrap());
}else {
println!("not opt_score");
if let Some(child) = cc {
let opt_qe = parse_cnml(&child, annotation_document);
......@@ -220,6 +234,7 @@ fn find_content(node : &mut Node, annotation_document : &Document, original_docu
c_unw.remove_property("score");
c_unw.add_property("score", &decl_score.unwrap());
} else {
println!("no declaration score");
let inv_score = 1.0 / score_annotation(&qe);
let as_score = bad_score_for_as(&qe);
......@@ -234,12 +249,13 @@ fn find_content(node : &mut Node, annotation_document : &Document, original_docu
}
}
}else if c_unw.get_name().eq("annotates"){
println!("annotates");
//eliminate annotations when they occur in the bibliography section
let url = c_unw.get_property("resource").unwrap();
// let container = url.split("'").nth(1).unwrap();
let start = url.split("'").nth(3).unwrap();
let start = url.split("\'").nth(3).unwrap();
// let end = url.split("'").nth(5).unwrap();
let query = format!("//*[@id='{}']", start);
let xpath_context = libxml::xpath::Context::new(&original_document.dom).unwrap();
......@@ -269,6 +285,7 @@ fn find_content(node : &mut Node, annotation_document : &Document, original_docu
let opt_new_qe = add_meaning_for_as(&vec);
if let Some(mut new_qe) = opt_new_qe{
println!("add something");
let mut new_node = Node::new("kat:contentmathml", None, annotation_document).unwrap();
new_node.add_property("rdf:parseType","Literal");
......
......@@ -116,8 +116,8 @@ We assume basic knowledge of LaTeX and HTML.
The KWARC Annotation Tool (KAT,~\cite{KATCICM14} and~\cite{KATCICM16}) is an annotation tool
with support
for mathematics which is implemented in JavaScript and executed by a browser.
It allows the definition of annotation formats in the form of so called
KAT Annotation Specifications (KAnnSpec). Besides an annotation mode, the tool also contains
It allows the definition of annotation formats in the form of
\textit{KAT Annotation Specifications} (KAnnSpec). Besides an annotation mode, the tool also contains
a review mode, in which a user can rate existing annotations -- for instance for the
evaluation of an automated annotation system.
Annotations can be stored as Resource Description Framework (RDF,~\cite{RDFPrime}) documents
......
There are various different ways of expressing the same quantity expression, even when it consists
of the same units. Compare, for instance, 5 m, 5 meter and five meter, which are all equivalent
of the same units. Compare, for instance, \quote{5 m}, \quote{5 meter} and \quote{five meter}, which are all equivalent
ways of stating exactly the same information.
The documents, we are working with, were originally written in LaTeX which also offers a wide variety of
The documents we are working with were originally written in \LaTeX\ which also offers a wide variety of
possibilities to typeset the same expression. Thus the goal of this section is to present concrete examples of
quantity expressions and units, which the author observed during his work
on this thesis, and to introduce a categorization for them (Section~\ref{ssec:categorication}).
......@@ -14,12 +14,13 @@ end of this section.
\subsection{Categorization of Quantity Expressions and Units}
\label{ssec:categorication}
\input{tex/tables/simpmucat.tex}
We distinguish quantity expressions from a syntactic and a semantic point
of view and for instance have a category containing relatively plain
expressions with units without superscripts (e.g.
$1\;\text{m}$; syntactically) and a category containing range expressions
for arbitrary units (i.e. $1 - 2\;\text{kg}$; semantically).
\quote{$1\;\rm m$}; syntactically) and a category containing range expressions
for arbitrary units (e.g. \quote{$1 - 2\;\rm kg$}; semantically).
This intentionally does not lead to a strictly disjoint categorization,
as the examples demonstrate.
For every example, we show the presentational part of the quantity expressions as well as
......@@ -27,47 +28,48 @@ end of this section.
It is sufficient to present short LaTeX snippets here, since the same observations
apply to the HTML code which is created from the LaTeX source.
\input{tex/tables.tex}
\input{tex/tables/simpdivcat.tex}
The most basic category is the one just mentioned for \textit{simple
multiplicative quantity expressions} (Table~\ref{tab:simpmutcat}).
We regard a quantity
expression as simple, if it contains any kind of numeric expression
followed by one or more unit symbols in a multiplicative way (e.g.
$3\;\text{Nm}$). This excludes unit symbols, which are in superscript, like
$30^\circ$, and we also exclude written-out unit symbols (i.e. meter) and
textual numbers for this class.
\quote{$3\;\text{Nm}$}). This excludes unit symbols, which are in superscript, like
\quote{$30^\circ$}, and we also exclude written-out unit symbols (e.g. meter) and
textual numbers for this class. \input{tex/tables/complexcat.tex}
Instead there is a separate category for textual unit symbols and
quantity expressions with textual numbers (Table~\ref{tab:textualcat}).
In addition to the first category, we introduce one for \textit{simple
divisive quantity expressions} (Table~\ref{tab:simpdivcat}), which differs
only by the fact that units may also occur in a divisive way (e.g.
$4\;\text{m/s}$).
\quote{$4\;\text{m/s}$}). \input{tex/tables/superscriptcat.tex}
We extend the simple multiplicative and divisive quantity expressions
to the category for \textit{complex quantity expressions}
(Table~\ref{tab:complexcat}) which contains expressions from the former
categories, but allows additional superscripts for units (e.g.
$5\;\text{m}^2$ or $5\;\text{m/s}^2$). This category subsumes the first two.
\quote{$5\;\text{m}^2$} or \quote{$5\;\text{m/s}^2$}). This category subsumes the first two.
\input{tex/tables/textualcat.tex}
Additionally, we have a category for quantity expressions where unit symbols
are part of the superscript -- as in $30^\circ$. Although written in their own
are part of the superscript -- as in \quote{$30^\circ$}. Although written in their own
manner, these expressions are, from a semantic point of view,
closely related to the simple multiplicative units.
closely related to the simple multiplicative units. \input{tex/tables/rangecat.tex}
Furthermore, there are extra classes for \textit{range expressions}
(Table~\ref{tab:rangecat}) and \textit{unit products}
(Table~\ref{tab:unitprodcat}) because they include the previous expressions, but contain additional semantic
information.
For instance $23\;\mu\text{m} \times 23\;\mu\text{m}$
describes not only an area of size $529\;\mu\text{m}^2$, but also contains the information
that the area is quadratic and has an edge length of $23\;\mu\text{m}$.
information. \input{tex/tables/unitprodcat.tex}
For instance \quote{$23\;\mu\text{m} \times 23\;\mu\text{m}$}
describes not only an area of size \quote{$529\;\mu\text{m}^2$}, but also contains the information
that the area is square and has an edge length of \quote{$23\;\mu\text{m}$}.
Additional information is necessary to handle quantity expressions involving constants (Table~\ref{tab:constantcat}).
For instance, we need to know that $\Omega_{a}$ ``is the ratio of the axion energy density to the
critical density in the Universe'' \cite{hep-ph/9807232} and that $h$ is the Hubble constant to understand
For instance, we need to know that \memph{$\Omega_{a}$ is the ratio of the axion energy density to the
critical density in the Universe}~\cite{hep-ph/9807232} and that \quote{$h$} is the Hubble constant to understand
the meaning of Example 2 of
this table.
this table. \input{tex/tables/constantcat.tex}
The examples in Table~\ref{tab:onlyunitcat} do not describe quantity expressions, but
depict that certain formulae are written in these units. Hence these terms also form their own class.
Figure~\ref{fig:taxonomy} summarizes the relations between the categories in a taxonomy.
\input{tex/tables/onlyunitcat.tex}
\begin{figure}
\begin{tikzpicture}[
>=stealth,
......@@ -115,11 +117,11 @@ end of this section.
In this section we discuss how we perceive the examples from the previous section.
At first, we look at the presentational part and then at the corresponding LaTeX source.
From the rendered result, we observe that unit string can be ambiguous.
For instance ``GHz'' is very likely to stand for ``Gigahertz'', but could also denote
``Gauß\footnote{Gauß is a unit of magnetic induction, commonly
abbreviated as ``G''.}~$\cdot$~Hertz''. Similarly, ``Pa'' has two possible meanings --
``Petayear'' and ``Pascal''.
From the rendered result, we observe that unit strings can be ambiguous.
For instance \quote{GHz} is very likely to stand for \quote{Gigahertz}, but could also denote
\quote{Gauß\footnote{Gauß is a unit of magnetic induction, commonly
abbreviated as \quote{G}.}~$\cdot$~Hertz}. Similarly, \quote{Pa} has two possible meanings --
\quote{Petayear} and \quote{Pascal\footnote{Pascal is a unit of pressure, commonly abbreviated as \quote{Pa}.}}.
Additionally, it is not always clear which part of the expression exponents refer to.
In Example 4 from Table~\ref{tab:complexcat}, $0.5$ eV/Å${}^{3}$, the exponent might refer
only to Ångström, but also to the whole expression. Hence we have the two possible meanings
......@@ -139,25 +141,6 @@ end of this section.
From its source, we can assume the meaning $(\text{W}/\text{cm})^2$, while the presentational part
intends $\text{W}/(\text{cm}^2)$. The latter is correct here, because the expression describes
the intensity of a laser beam.
% From the examples in the previous section, we immediately
% observe the mixture of text and math mode.
% A spotter that works completely in math mode or completely in text mode
% does thus not seem to be very useful. Instead this requires an uniform
% approach being able to handle the changes between math and text mode.
%
%% Furthermore, we realise that the meaning of some examples is not unambiguous.
%% For instance, in Example 4 from Table~\ref{tab:complexcat}, it is --
%% without any further information -- uncertain, whether its author intended
%% the meaning $\text{eV}/(\text{\AA}^3)$ or $(\text{eV}/\text{\AA})^3$.
%
% Furthermore, we observe that unit strings can be ambiguous.
% For instance, ``GHz'' is very likely to stand for
% ``gigahertz'', but could also denote
% ``gaus\footnote{Gaus is a unit of magnetic induction, commonly
% abbreviated as ``G''.}~$\cdot$~hertz''. In the same way, ``Pa'' has two
% possible meanings -- ``petayear'' and ``pascal''.
%
......
\begin{table}
\center
\begin{tabular}{|c|c|p{8.5cm}|c|}
\hline
Nr. & Rendered Result & LaTeX-source & Reference \\
\hline
1 & $5GHz$ & \verb|$5GHz$| &
\cite{cond-mat/9807111} \\
2 & $5\text{GHz}$ & \verb|$5\text{GHz}$|
& \\
3 & $10^{-10}s$ & \verb|$10^{-10}s$| & \cite{cond-mat/9807111} \\
4 & $6 \mu \text{m}$ & \verb|$6 \mu \text{m}$| &
\cite{physics/9807021} \\
5 & 0.45 eV & 0.45 eV & \cite{cond-mat/9807211}\\
6 & 200 $\mu$m & 200 \verb|$\mu$|m & \cite{cond-mat/9807012} \\
7 & $25$M$\Omega$ & \verb|$25$|M\verb|$\Omega$| &
\cite{cond-mat/9807012} \\
8 & $0.6M_\odot$ & \verb|$0.6M_\odot$|
& \cite{astro-ph/9807152} \\
9 & 0.6 $M_\odot$ & 0.6 \verb|$M_\odot$|
& \cite{astro-ph/9807152} \\
10 & 10$^{-4}$ M$_\odot$ & 10\verb|$^{-4}$| M\verb|$_\odot$| & \cite{astro-ph/9211002} \\
\hline
\end{tabular}
\caption{Examples of the Category of Simple Multiplicative Quantity
Expressions.}
\label{tab:simpmutcat}
\end{table}
\begin{table}
\center
\begin{tabular}{|c|c|p{8.5cm}|c|}
\hline
Nr. & Rendered Result & LaTeX-source & Reference \\
\hline
1 & $10^{20}/s$ & \verb|$10^{20}/s$| & \cite{hep-ph/9807232} \\
2 & $10^{17}$/s & \verb|$10^{17}$|/s & \cite{hep-ph/9807232} \\
3 & 20 fm/c & 20 fm/c & \cite{nucl-th/9807088} \\
\hline
\end{tabular}
\caption{Examples of the Category of Simple Divisive Quantity
Expressions.}
\label{tab:simpdivcat}
\end{table}
\begin{table}
\center
\begin{tabular}{|c|c|p{8.5cm}|c|}
\hline
Nr. & Rendered Result & LaTeX-source & Reference \\
\hline
1 & $1.0\cdot10^{17} \text{W/cm}^{2}$ &
\verb|$1.0\cdot10^{17}\text{W/cm}^{2}$|
& \cite{physics/9807021} \\
2 & $1.0\mbox{Wcm}^{-2}\mu\text{m}^{2}$ &
\verb|$1.0\mbox{Wcm}^{-2}| & \\
& & \verb|\mu\text{m}^{2}$|
& \cite{physics/9807021} \\
3 & $4.8\times 10^{9}$ cm${}^{-2}$ &
\verb|$4.8\times 10^{9}$| cm\verb|${}^{-2}$| &
\cite{cond-mat/9807235} \\
4 & $0.5$ eV/Å${}^{3}$ &
\verb|$0.5$| eV/Å\verb|${}^{3}$| &
\cite{cond-mat/9807211} \\
5 & $100$ km s${}^{-1}$ Mpc${}^{-1}$ &
\verb|$100$| km s\verb|${}^{-1}$| Mpc\verb|${}^{-1}$|
& \cite{hep-ph/9807232} \\
6 & 3.72 $\times$ 10 ${}^{10}$cm${}^{-2}$ &
3.72 \verb|$\times$| 10\verb|${}^{10}$|cm\verb|${}^{-2}$| &
\cite{cond-mat/9807235} \\
7 & $F_{-7}\times10^{-7}~{}{\rm ergsec}^{-1}{\rm cm}^{-2}$ &
\verb|$F_{-7} \times 10^{-7}| & \\
& & \verb|{\rm ergsec}^{-1}{\rm cm}^{-2}$| &
\cite{astro-ph/9211009} \\
8 & $(0.4 \; \rm GeV)^2$ & \verb|$(0.4 \;\rm GeV)^2$| & \cite{hep-ph/9211221} \\
9 & $10^{-34}{\rm gm/cm^{3}}$ & \verb|$10^{-34}{\rm gm/cm^{3}}$| & \cite{gr-qc/9211006} \\
\hline
\end{tabular}
\caption{Examples of the Category of Complex Quantity Expressions.}
\label{tab:complexcat}
\end{table}
\begin{table}
\center
\begin{tabular}{|c|c|p{8.5cm}|c|}
\hline
Nr. & Rendered Result & LaTeX-source & Reference \\
\hline
1 & $30^\circ$ & \verb|$30^\circ$| & \cite{physics/9807021} \\
2 & $00^{h}49^{m}37^{s}.71$ & \verb|$00^{h}49^{m}37^{s}.71$| &
\cite{astro-ph/9807152} \\
3 & $-29^{\circ}50^{\prime}58^{\prime\prime}.7$ &
\verb|$-29^{\circ}50^{\prime}| & \cite{astro-ph/9807152} \\ & &
\verb|58^{\prime\prime}.7$| & \\
4 & $0.025^{\circ}C$ & \verb|$0.025^{\circ}C$| & \cite{patt-sol/9807001} \\
5 & $2\;^{\circ}$ C & \verb|$2\;^{\circ}$|C & \cite{gr-qc/9211005} \\
6 & $10^{\prime\prime}$ & \verb|$10^{\prime\prime}$| & \cite{gr-qc/9211006} \\
\hline
\end{tabular}
\caption{Examples of the Category of Quantity
Expressions with Units in Superscript.}
\label{tab:superscriptcat}
\end{table}
\begin{table}
\center
\begin{tabular}{|c|c|p{7.9cm}|c|}
\hline
Nr. & Rendered Result & LaTeX-source & Reference \\
\hline
1 & five seconds & five seconds & \\
2 & 1.27 square degrees & 1.27 square degrees &
\cite{astro-ph/9807152} \\
3 & one GHz & one GHz & \cite{hep-ph/9211201} \\
\hline
\end{tabular}
\caption{Examples of the Category of Textual Quantity Expressions.}
\label{tab:textualcat}
\end{table}
\begin{table}
\center
\begin{tabular}{|c|c|p{7.5cm}|c|}
\hline
Nr. & Rendered Result & LaTeX-source & Reference \\
\hline
1 & $0.01-0.1\mu m^{2}$ & \verb|$0.01-0.1\mu m^{2}$| &
\cite{cond-mat/9807111} \\
2 & $0.53 \pm 0.01$ eV & \verb|$0.53 \pm 0.01$| eV
& \cite{cond-mat/9807211} \\
3 & 3.7–19 GeV & 3.7–19 GeV & \cite{astro-ph/9807288} \\
4 & 20 to 10,000 kilometers & 20 to 10,000 kilometers
& \cite{hep-ph/9211214} \\
\hline
\end{tabular}
\caption{Examples of the Category of Range Expressions.}
\label{tab:rangecat}
\end{table}
\begin{table}
\center
\begin{tabular}{|c|c|p{8.5cm}|c|}
\hline
Nr. & Rendered Result & LaTeX-source & Reference \\
\hline
1 & $23\;\mu\text{m}\times23\;\mu\text{m}$
&\verb|$23\;\mu\text{m}\times| & \cite{physics/9807021} \\ & &
\verb|23\;\mu\text{m}$| & \\
\hline
\end{tabular}
\caption{Examples of the Category of Quantity Products.}
\label{tab:unitprodcat}
\end{table}
\begin{table}
\center
\begin{tabular}{|c|c|p{8.5cm}|c|}
\hline
Nr. & Rendered Result & LaTeX-source & Reference \\
\hline
1 & $3 h^{-1}$ Gpc & \verb|$3 h^{-1}$ Gpc| & \cite{hep-ph/9211201} \\
2 & $10^{-12}M_{\odot}\Omega_{a}h^{2}$ & \verb|$10^{-12}M_{\odot}\Omega_{a}h^{2}$| & \cite{hep-ph/9807232} \\
\hline
\end{tabular}
\caption{Examples of the Category of Quantity
Expressions involving Constants.}
\label{tab:constantcat}
\end{table}
\begin{table}
\center
\begin{tabular}{|c|c|p{8.5cm}|c|}
\hline
Nr. & Rendered Result & LaTeX-source & Reference \\
\hline
1 & [...] $(\rm MeV/fm^{3})$ & \verb|(\rm MeV/fm^{3})| &
\cite{nucl-th/9807088} \\
2 & in GeV ${}^2$ & in GeV \verb|${}^2$| & \cite{hep-ph/9211221} \\
\hline
\end{tabular}
\caption{Examples of Single Units.}
\label{tab:onlyunitcat}
\end{table}
......@@ -40,7 +40,6 @@
%\addbibresource{literatur.bib}
\lstset{
basicstyle=\ttfamily,
columns=fullflexible,
......@@ -62,6 +61,8 @@
escapeinside={(*}{*)}
}
\def\inline{\lstinline[language=XML]}
\def\memph#1{``\textit{#1}''}
\def\quote#1{``#1''}
\title{Meaning Extraction and Semantic Services in STEM-Documents}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment