Commit ea14b646 authored by Ulrich's avatar Ulrich

screen reader parallel

parent 72b31b66
......@@ -3,8 +3,10 @@ import os
import copy
from bs4 import BeautifulSoup, Tag, NavigableString
from astropy import units as astropy_units
from concurrent.futures import ThreadPoolExecutor
# Lookup table from unit symbols to written-out unit names
# (presumably the wording read out by the screen reader -- confirm at the use site).
translation = {"M_sun" : "solar masses", "AA" : "Angstrom"}
# Module-wide thread pool limited to three worker threads.
executor = ThreadPoolExecutor(max_workers=3)
def read_annotations(rdf_soup):
annotations = []
......@@ -629,17 +631,34 @@ def process(html_soup, rdf_soup, output_path):
sem = next(math.children)
if semantics_is_empty(sem) or next(sem.children).name == "annotation-xml":
math.decompose()
except AttributeError as err:
except _:
print("annotation error")
print(format(err))
html = html_soup.prettify("utf-8")
with open(output_path, "wb") as file:
file.write(html)
def process_file(path_file, html_path, rdf_path, output_path):
    """Process a single HTML document and write the result below ``output_path``.

    The path of ``path_file`` relative to ``html_path`` is used to locate the
    matching ``.kat.xml`` file below ``rdf_path`` and to place the output file
    at the same relative location below ``output_path``.

    Parameters:
        path_file:   path of the HTML document; expected to start with ``html_path``.
        html_path:   root folder of the HTML input tree.
        rdf_path:    root folder of the ``.kat.xml`` annotation tree.
        output_path: root folder the processed document is written to.

    Failures inside ``process`` are reported on stdout and do not propagate.
    Errors while opening either input file still propagate to the caller.
    """
    print("path " + path_file)
    _relative_html = path_file[len(html_path):]
    _relative_rdf = _relative_html.replace(".html", ".kat.xml")
    _path_rdf = rdf_path + "/" + _relative_rdf
    _output_file = output_path + "/" + _relative_html
    # Everything up to and including the last "/" is the target folder.
    _output_folder = _output_file[:_output_file.rfind("/") + 1]
    print("output " + _output_folder)
    # exist_ok avoids the check-then-create race when several worker threads
    # need the same folder at the same time.
    os.makedirs(_output_folder, exist_ok=True)
    # The with-statement closes both files even if parsing or processing fails.
    with open(path_file) as _html_file, open(_path_rdf) as _rdf_file:
        _html_soup = BeautifulSoup(_html_file, 'html.parser')
        _rdf_soup = BeautifulSoup(_rdf_file, 'html.parser')
        try:
            process(_html_soup, _rdf_soup, _output_file)
            print("success: " + path_file)
        except Exception as err:
            # The original "except _:" named an undefined identifier, so a
            # failure raised NameError and this report was never printed.
            print("error: " + path_file)
            print(format(err))
if len(sys.argv) != 4:
......@@ -655,6 +674,8 @@ print (html_path)
#rdf_file = open(sys.argv[2])
output_path = sys.argv[3]
#html_soup = BeautifulSoup(html_file, 'html.parser')
#rdf_soup = BeautifulSoup(rdf_file, 'html.parser')
......@@ -663,29 +684,24 @@ if not os.path.isdir(output_path):
exit()
if os.path.isdir(html_path) and os.path.isdir(rdf_path):
#get all html documents
#def process_file(root, f, html_path, rdf_path, output_path):
html_docs = []
for root, dirnames, filenames in os.walk(html_path):
for f in filenames:
if f.endswith(".html"):
path_file = root + "/" + f
relative_html = path_file[len(html_path):]
relative_rdf = relative_html.replace(".html",".kat.xml")
path_rdf = rdf_path + "/" + relative_rdf
output_file = output_path + "/" + relative_html
output_folder = output_path + "/" + root[len(html_path):]
print(output_folder)
if not os.path.isdir(output_folder):
os.makedirs(output_folder)
print(path_file)
print(path_rdf)
print(output_file)
print("\n")
html_file = open(path_file)
rdf_file = open(path_rdf)
html_soup = BeautifulSoup(html_file, 'html.parser')
rdf_soup = BeautifulSoup(rdf_file, 'html.parser')
process(html_soup, rdf_soup, output_file)
html_file.close()
rdf_file.close()
comp = root + "/" + f
html_docs.append(comp)
# process_file(html_docs[0], html_path, rdf_path, output_path)
executor.map(lambda path: process_file(path, html_path, rdf_path, output_path), html_docs)
elif os.path.isfile(html_path) and os.path.isfile(rdf_path):
......
......@@ -123,37 +123,40 @@ end of this section.
abbreviated as \quote{G}.}~$\cdot$~Hertz}. Similarly, \quote{Pa} has two possible meanings --
\quote{Petayear} and \quote{Pascal\footnote{Pascal is a unit of pressure, commonly abbreviated as \quote{Pa}.}}.
Additionally, it is not always clear which part of the expression exponents refer to.
In Example 4 from Table~\ref{tab:complexcat}, $0.5$ eV/Å${}^{3}$, the exponent might refer
only to Ångström, but also to the whole expression. Hence we have the two possible meanings
$0.5 \; \text{eV/(Å}^3\text{)}$ and $0.5 \; \text{(eV/Å)}^3$. Given the context of the paper,
In Example 4 from Table~\ref{tab:complexcat}, \quote{$0.5$ eV/Å${}^{3}$}, the exponent might refer
only to Ångström\footnote{Ångström is a unit of length ($1 \; \rm \text{\AA} = 10^{-10} \; m$).},
but also to the whole expression. Hence we have the two possible meanings
\quote{$0.5 \; \text{eV/(Å}^3\text{)}$} and \quote{$0.5 \; \text{(eV/Å)}^3$}. Given the context of the paper,
we see that the former was meant in this case.
In the rendered result, the changes between text and math mode do not complicate the understanding of the
expressions for humans.
But when looking at the LaTeX source, we observe that this adds a lot of noise to the data.
People tend to misuse text mode to ensure an upright font for units, instead of using the
\verb|\rm| command. This leads to somewhat surprising encodings like \verb|$100$| km s\verb|${}^{-1}$| Mpc
\verb|${}^{-1}$|, 10\verb|$^{-4}$|M\verb|$_\odot$| and especially
3.72 \verb|$\times$| 10 \verb|${}^{10}$| cm \verb|${}^{-2}$|.
\verb|\rm| command. This leads to somewhat surprising encodings like \\
``\verb|$100$| km s\verb|${}^{-1}$| Mpc \verb|${}^{-1}$|'', \\
``10\verb|$^{-4}$|M\verb|$_\odot$|'' and especially \\
``3.72 \verb|$\times$| 10 \verb|${}^{10}$| cm \verb|${}^{-2}$|''. \\
Example 1 from Table~\ref{tab:complexcat} is also a remarkable case, because here,
we see a difference between the semantics of the LaTeX
source and the rendered result.
From its source, we can assume the meaning $(\text{W}/\text{cm})^2$, while the presentational part
intends $\text{W}/(\text{cm}^2)$. The latter is correct here, because the expression describes
From its source, we can assume the meaning \quote{$\rm (W/cm)^2$}, while the presentational part
seems more consistent with \quote{$\rm W/(cm^2)$}. The latter is correct here, because the expression describes
the intensity of a laser beam.
\subsection{Restrictions for this Thesis}
For the further part of the thesis, we restrict our attention to the
The detection of all kinds of quantity expressions and units in STEM documents is a task
which exceeds the scope of this thesis and the author thus has to restrict his
attention to only a part of this problem.
We further investigate the detection of quantity expressions from the
categories in the Tables~\ref{tab:simpmutcat} to~\ref{tab:superscriptcat}.
We omit textual quantity expressions, because they
We omit textual quantity expressions and single units, because they
hardly occur in the documents and because they need to be handled
differently. We also omit unit products and range expressions, due to the additional
complexity of handling their semantics, as well as expressions involving constants, which
would also require detecting or inferring the definition of the constants.
The detection and processing of single units (Table~\ref{tab:onlyunitcat}) is also out of the scope of
this thesis.
% The different kinds of units and quantity expressions require
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment