Commit 82efb353 authored by Ulrich's avatar Ulrich

thesis final version

parent 26445913
......@@ -88,7 +88,7 @@
\frametitle{Einleitung}
\begin{itemize}[label=$\bullet$]
\item erste Messungen vor mindestens 30 000 Jahren~\cite{haustein2001weltchronik}
\item Entwicklung "`menschlicher"' Einheiten -- z.B. Fuß und Elle
\item Entwicklung \mq{menschlicher} Einheiten -- z.\,B.\ Fuß und Elle
\item Meile (1609 Meter), Seemeile (1852 Meter), Landmeile (7532 Meter), \dots\ -- Meilenschwindel~\cite{sippel2001}
\item Metrisches vs. angloamerikanisches System
\item SI-System -- Meter, Sekunde, Kilogramm, Ampere, Kelvin, Mol, Candela
......@@ -98,6 +98,7 @@
\begin{frame}
\frametitle{Beitrag dieser Arbeit}
Analyse von arXiv-Dokumenten:
\begin{itemize}[label=$\bullet$]
\item Wie schreiben Wissenschaftler Mengenangaben in LaTeX?
\item Erkennung von Mengenangaben in MINT-Dokumenten
......@@ -122,7 +123,7 @@ Einfache Multiplikative MA:
$5\text{GHz}$ \> \verb|$5\text{GHz}$| \\
0.45 eV \> \verb|0.45 eV| \\
200 $\mu$m \> \verb|200 $\mu$m| \\
10$^{-4}$ M$_\odot$ \> \verb|10$^{-4}$ M$_\odot$|
% 10$^{-4}$ M$_\odot$ \> \verb|10$^{-4}$ M$_\odot$|
\end{tabbing}
\pause
Einfache divisionsbasierte MA:
......@@ -285,9 +286,9 @@ Lediglich Einheiten:
\begin{frame}[fragile]
\frametitle{Phänomenologie -- Mehrdeutigkeiten}
\begin{tabular}{lll}
5 Pa & \hspace*{2.5cm} & \onslide<3->{5 GHz} \\
\hspace*{.5cm} \onslide<2->{5 Pascal} & & \hspace*{.5cm} \onslide<3->{5 Gigahertz} \\
\hspace*{.5cm} \onslide<2->{5 Petajahre} & & \hspace*{.5cm} \onslide<3->{5 Gauß $\cdot$ Hertz} \\
5 Pa & \hspace*{2.5cm} & \onslide<3->{5 mN} \\
\hspace*{.5cm} \onslide<2->{5 Pascal} & & \hspace*{.5cm} \onslide<3->{5 Millinewton} \\
\hspace*{.5cm} \onslide<2->{5 Petajahre} & & \hspace*{.5cm} \onslide<3->{5 Meter $\cdot$ Newton} \\
& & \\
\onslide<4->{$0.5$ eV/Å$\;{}^{3}$} & & \\
\hspace*{.5cm} \onslide<4->{$0.5$ (eV/Å)$^3$} & & \\
......@@ -298,9 +299,9 @@ Lediglich Einheiten:
\begin{frame}[fragile]
\frametitle{Phänomenologie -- Mehrdeutigkeiten}
\begin{tabular}{lll}
5 Pa & \hspace*{2cm} & \onslide<1->{5 GHz} \\
\hspace*{.5cm} \onslide<1->{5 Pascal} & & \hspace*{.5cm} \onslide<1->{5 Gigahertz} \\
\hspace*{.5cm} \onslide<1->{5 Petajahre} & & \hspace*{.5cm} \onslide<1->{5 Gauß $\cdot$ Hertz} \\
5 Pa & \hspace*{2cm} & \onslide<1->{5 mN} \\
\hspace*{.5cm} \onslide<1->{5 Pascal} & & \hspace*{.5cm} \onslide<1->{5 Millinewton} \\
\hspace*{.5cm} \onslide<1->{5 Petajahre} & & \hspace*{.5cm} \onslide<1->{5 Meter $\cdot$ Newton} \\
& & \\
\onslide<1->{$0.5$ eV/Å$\;{}^{3}$} & & \verb|$1.0\text{Wcm}^{-2}$| \\
\hspace*{.5cm} \onslide<1->{$0.5$ (eV/Å)$^3$} & & \hfill -- $1.0\text{Wcm}^{-2}$ \\
......@@ -330,7 +331,7 @@ Lediglich Einheiten:
\end{itemize}
\end{frame}
\section{MathML}
%\section{MathML}
%\begin{frame}
% \frametitle{Grundlagen und Werkzeuge}
......@@ -344,23 +345,23 @@ Lediglich Einheiten:
% \item Von LaTeXML generierte HTML-Version des e-print Archivs arXiv
% \end{itemize}
%\end{frame}
\begin{frame}
\frametitle{MathML}
\begin{figure}
\begin{mdframed}
\begin{subfigure}{.3\textwidth}
\lstinputlisting[language=XML, basicstyle=\scriptsize]{xml/presMathMLExample2.xml}
\end{subfigure}
\hfill
\begin{subfigure}{.45\textwidth}
\onslide<2->{\lstinputlisting[language=XML, basicstyle=\scriptsize]{xml/contentMathMLExample2.xml}}
\end{subfigure}
\\
{\footnotesize \begin{center} \mq{$g(x + y)$} \end{center}} %\begin{center} \mq{$3.0 \cdot 10^{-17} \si{\micro\meter}$} \end{center}}
\end{mdframed}
\end{figure}
\end{frame}
%
%\begin{frame}
% \frametitle{MathML}
%\begin{figure}
%\begin{mdframed}
% \begin{subfigure}{.3\textwidth}
% \lstinputlisting[language=XML, basicstyle=\scriptsize]{xml/presMathMLExample2.xml}
% \end{subfigure}
% \hfill
% \begin{subfigure}{.45\textwidth}
% \onslide<2->{\lstinputlisting[language=XML, basicstyle=\scriptsize]{xml/contentMathMLExample2.xml}}
% \end{subfigure}
% \\
% {\footnotesize \begin{center} \mq{$g(x + y)$} \end{center}} %\begin{center} \mq{$3.0 \cdot 10^{-17} \si{\micro\meter}$} \end{center}}
%\end{mdframed}
%\end{figure}
%\end{frame}
\section{Semantikextraktion}
......@@ -379,15 +380,17 @@ Lediglich Einheiten:
% cylinder end fill=yellow!50,
shape border rotate=90,
aspect=0.25,
scale=0.8,
scale=0.9,
draw,
}
]
\node[database] (newDB) {Mod. Corpus};
\node[rectangle, draw, above=.5cm of newDB] (tok) {Vorverarbeitung};
\node[rectangle, draw, above=.5cm of newDB] (tok) {\small Vorverarbeitung};
\node[database, below=1.5cm of newDB] (anDB) {Annotationen};
\node[database, above=.5cm of tok] (oldDB) {arXMLiv-Corpus};
\node[database, above=.5cm of tok] (oldDB) {arXMLiv};
\node[rectangle, draw, right=.7cm of oldDB] (latexml) {LaTeXML};
\node[database, right=.7cm of latexml] (arxiv) {ArXiv};
\node[rectangle, draw, below left=0.75cm of newDB] (spot) {Spotter/Scorer};
\node[rectangle, draw, right = 2.5cm of newDB] (conv) {Autom. Konversion};
\node[rectangle, draw, below =.75cm of conv] (blind) {Screen Reader};
......@@ -406,7 +409,9 @@ Lediglich Einheiten:
\draw[-{latex}] (invis2) |- (conv);
\draw[-{latex}] (invis2) |- (blind);
\draw[-{latex}] (invis2) |- (search);
\draw[-{latex}] (arxiv) -- (latexml);
\draw[-{latex}] (latexml) -- (oldDB);
\end{tikzpicture}
}
%\caption{The architecture of the implementation.}
......@@ -812,8 +817,8 @@ Lediglich Einheiten:
\begin{tabular}{ll}
\onslide<1->{[(\textmu, 1), (m, 1)]} &
\onslide<3->{\mq{Micrometer}} \\
\onslide<1->{[(G, 1), (e, 1), (V, 1)]} &
\onslide<4->{$\rm G \cdot eV$, $\rm GeV$} \\
\onslide<1->{[(m, 1), (N, 1)]} &
\onslide<4->{\mq{Millinewton}, \mq{Meter $\cdot$ Newton}} \\
\onslide<1->{[(Wcm, -2)]} &
\onslide<5->{\mq{$\rm W^{-2}cm^{-2}$},
\mq{$\rm W^{1}cm^{-2}$}} \\
......@@ -879,24 +884,24 @@ Lediglich Einheiten:
\footnotesize
Regelbasiertes System, um Präferenzen für Bedeutungen festzulegen:
\begin{itemize}[label=$\bullet$]
\item bevorzuge Ausdrücke mit Präfixen
\item bevorzuge \mq{lange} Einheiten und Ausdrücke mit Präfixen
\begin{itemize}
\item \mq{$\rm GeV$} statt \mq{$\rm G \cdot eV$}
\item \mq{$\rm mm$} statt \mq{$\rm m \cdot m$}
\item \mq{Pascal} statt \mq{Petajahre} für \mq{Pa}
\item \mq{Millinewton} statt \mq{Meter $\cdot$ Newton} für \mq{mN}
\end{itemize}
\pause
\item bevorzuge Bedeutungen, bei denen sich der Exponent nur auf
die letzte Einheit bezieht
\begin{itemize}
\item \mq{$\rm W (cm^2)$} statt \mq{$\rm (W cm)^2$} für \verb|$\text{Wcm}^2$|
\item \mq{$\rm W (cm^2)$} statt \mq{$\rm (W cm)^2$} für \verb|$\text{Wcm}^2$|
\end{itemize}
\pause
\item ``Eliminiere'' deklarierte Variablen~\cite{janbsc}
\item \mq{Eliminiere} deklarierte Variablen~\cite{janbsc}
\begin{itemize}
\item \mq{[\dots] for each value of $\lambda, \delta, g$ and $N$ [\dots] $10 N$ [\dots]}
\end{itemize}
\pause
\item ``Eliminiere'' Attosekunden
\item \mq{Eliminiere} Attosekunden
\begin{itemize}
\item \mq{[\dots] $\langle\phi\rangle=0$ as the temperature drops below [\dots]}
\end{itemize}
......@@ -910,7 +915,7 @@ Regelbasiertes System um Präferenzen für Bedeutungen festzulegen:
\frametitle{Evaluation}
\begin{itemize}[label=$\bullet$]
\item erfolgreich 35000 Dokumente verarbeitet
\item Laufzeit: 80 Stunden auf 9x2.00 GHz Prozessoren
%\item Laufzeit: 80 Stunden auf 9x2.00 GHz Prozessoren
\item manuelle Auswertung von 50 Dokumenten
\end{itemize}
\pause
......
......@@ -2,6 +2,8 @@ The author implemented three prototypical applications to demonstrate the benefi
The first is a service for the conversion of units in documents (Section~\ref{ssec:implunitconv}).
We then describe an enhancement for screen reading applications with semantic information (Section~\ref{ssec:implscreen}).
In Section~\ref{ssec:implharvest}, we explain how to exploit our results for a semantic search engine.
The semantic services are also displayed in Figure~\ref{fig:architecture}, which describes the architecture of the
implementation.
\subsection{Conversion inside Documents}
......@@ -13,7 +15,7 @@ This part of the author's work has already been published as a part of a
report about OpenDreamKit~\cite{opendreamkitreport} and we follow the presentation of this report.
\subsubsection{Use Case}
In order to convert a document from one system of measurement to another, a user has to open a
In order to convert a document from one system of measurement to another, a user currently has to open a
conversion service and enter the data. This is not only a distraction from the document, but also
a source of errors due to the manual entering of the data.
Automatic unit conversion in documents can solve these problems. Imagine, for instance, a recipe
......@@ -32,7 +34,7 @@ Figure~\ref{fig:highlight} shows the result of this operation and we further ref
\begin{figure}[ht]
\fbox{\includegraphics[width=\textwidth]{screenshots/highlight}}
\caption{Highlighting spotted Quantity Expressions in~\cite{physics/9807021}. Figure from~\cite{opendreamkitreport}.}
\caption{Highlighting Spotted Quantity Expressions in~\cite{physics/9807021}. Figure from~\cite{opendreamkitreport}.}
\label{fig:highlight}
\end{figure}
......@@ -157,10 +159,10 @@ descriptions to nodes in HTML5 documents for accessibility purposes~\cite{Craig:
\fbox{\lstinputlisting[language=XML]{xml/srspan.xml}}
\caption{An Example of \inline{span} Nodes Wrapped in a \inline{div} Node.}\label{fig:srspan}
\end{subfigure}
\caption{Different Possibilities of adding aria labels to nodes in HTML5.}\label{fig:sr}
\caption{Different Possibilities of Adding Aria Labels to Nodes in HTML5.}\label{fig:sr}
\end{figure}
For the first method, we added aria labels to presentation MathML nodes as in
As a first approach, we added aria labels to presentation MathML nodes as in
Figure~\ref{fig:srdeep}. We refer to this approach as \textit{deep labeling}, as it adds
the labels directly to the corresponding nodes. In addition to that,
we have tried to annotate whole expressions by wrapping them in labeled \inline{div}
......@@ -190,7 +192,7 @@ presentation MathML and \inline{span} nodes in \inline{div} nodes.
As a workaround, we convert spotted quantity expressions to presentation MathML, wrap them in \inline{div}
nodes and attach meaningful aria labels for the expression.
Only stand-alone quantity expressions can be processed this way.
Like this, we cannot handle quantity expressions that are part of formulae since this would imply do destroy the
This way, we cannot handle quantity expressions that are part of formulae, since this would destroy the
structure of the corresponding MathML term, which is not desirable.
With NVDA and Internet Explorer, it is now possible to read, say, \mq{three solar masses} and switching to JAWS allows
us to hear the encoding of the expression; here \mq{three M sub circled dot operator}.
......@@ -230,7 +232,7 @@ During the conversion, we omit quantity expressions with a negative score.
\begin{figure}
\lstinputlisting[language=XML, frame=single]{xml/tom.xml}
\centering
\caption{The general Frame of Harvest Files for Tom Wiesing's Search Engine.}
\caption{The General Frame of Harvest Files for Tom Wiesing's Search Engine.}
\label{fig:tomharvest}
\end{figure}
......
......@@ -15,7 +15,7 @@ We assume basic knowledge of LaTeX, HTML and XML.
explicitly denoting the semantics of an expression. Figure
\ref{fig:generalMathML} shows the general frame for an expression in MathML
which always starts with a \inline{math} tag and the corresponding namespace.
The \inline{semantics} child is an container whose first child is usually an
The \inline{semantics} child is a container whose first child is usually an
expression in presentation MathML, followed by annotations for this element.
An annotation can be in XML, for instance content MathML, or it can be any
sequence of characters, for example LaTeX code.
......@@ -44,7 +44,7 @@ We assume basic knowledge of LaTeX, HTML and XML.
identifiers (\inline{mi}) and text (\inline{mtext}), while layout schemata
include horizontal grouping (\inline{mrow}), superscript (\inline{msup}) and
subscript (\inline{msub}). Tags can be modified by attributes, such as the
\tt{mathvariant} attribute. Its value \tt{normal} ensures a upright
\texttt{mathvariant} attribute. Its value \texttt{normal} ensures an upright
font for the correct presentation of a unit in MathML -- for details compare
the W3C note about units in MathML~\cite{Devitt:03:UM}.
......@@ -72,7 +72,7 @@ We assume basic knowledge of LaTeX, HTML and XML.
\begin{figure}
\lstinputlisting[language=XML,frame=single]{xml/contentMathMLExample.xml}
\centering
\caption{A content MathML expression for $3.0 \cdot 10^{-17}\si{\micro\metre}$.}
\caption{A Content MathML Expression for \mq{$3.0 \cdot 10^{-17}\si{\micro\metre}$}.}
\label{fig:contentMathMLexample}
\end{figure}
......@@ -92,8 +92,8 @@ We assume basic knowledge of LaTeX, HTML and XML.
formula.
LaTeXML was used to convert the online e-print archive, arXiv, from LaTeX
to HTML5~\cite{stamerjohanns2010transforming}. The resulting corpus is called arXMLiv. The
arXiv contains articles from areas such as physics, mathematics, computer science,
to HTML5. The resulting corpus is called arXMLiv~\cite{stamerjohanns2010transforming}. The
arXiv\footnote{http://www.arxiv.org} contains articles from areas such as physics, mathematics, computer science,
quantitative biology, quantitative finance and statistics. The articles are mostly uploaded
as LaTeX files and published as PDF and PostScript documents. PDF and PostScript files are
obviously impractical for any further processing, but the LaTeX source is also not intended for
......@@ -125,7 +125,7 @@ We assume basic knowledge of LaTeX, HTML and XML.
\begin{figure}
\includegraphics[scale=0.4]{KAT.png}
\caption{Creating an annotation in KAT (screenshot from~\cite{KATCICM16}).}
\caption{Creating an Annotation in KAT (Screenshot from~\cite{KATCICM16}).}
\label{fig:KAT}
\end{figure}
......
......@@ -125,7 +125,9 @@ end of this section.
In this section we discuss the perception of the examples from the previous section.
First, we look at the presentational part and then at the corresponding LaTeX source.
From the rendered result, we observe that unit strings can be ambiguous.
Marcus Foster pointed out that unit strings can be ambiguous
(compare related work in Section~\ref{ssec:relatedwork}) and
we agree with his observation.
For instance, \mq{GHz} is very likely to stand for \mq{gigahertz}, but could also denote
\mq{gauß\footnote{Gauß is a unit of magnetic induction, commonly
abbreviated as \mq{G}.}~$\cdot$~hertz}. Similarly, \mq{Pa} has two possible meanings --
......
This diff is collapsed.
......@@ -159,6 +159,9 @@ Prof. Dr. Michael Kohlhase \\ % Supervisor's Name
\end{titlepage}
\hfill
\newpage
%\maketitle
\mbox{}\vspace{0.4\textheight}
......@@ -176,13 +179,23 @@ gekennzeichnet.
\bigskip
Erlangen, \today
Erlangen, 7. Juni 2017 %\today
\newpage
\hfill
\newpage
\tableofcontents
\newpage
\listoffigures
\newpage
\listoftables
\newpage
\pagenumbering{arabic}
\section{Introduction}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment