From 88f1d15d9f430818d7a707558203476f21b4bbed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Sch=C3=A4rtl?= <andreas@schaertl.me>
Date: Mon, 17 Aug 2020 14:33:05 +0200
Subject: [PATCH] report: add section on collecter and importer

---
 doc/report/implementation.tex | 104 +++++++++++++++++++++++++++-------
 doc/report/references.bib     |  67 ++++++++++++++++++++++
 2 files changed, 149 insertions(+), 22 deletions(-)

diff --git a/doc/report/implementation.tex b/doc/report/implementation.tex
index 83ca3a5..ef3b5fc 100644
--- a/doc/report/implementation.tex
+++ b/doc/report/implementation.tex
@@ -31,9 +31,6 @@ implemented components and their relationships.
   use in this project, the GraphDB~\cite{graphdb} triplet store was
   a natural fit.
 
-  For this project, both Collecter and Importer ended up being one
-  piece of monolithic software, but this does not have to be the case.
-
 \item Finally, with all triplets stored in a database, an
   \emph{Endpoint} is where applications access the underlying
   knowledge base. This does not necessarily need to be any custom
@@ -54,25 +51,88 @@ take a look at the actual implementation created for
 
 \subsection{Collecter and Importer}\label{sec:collecter}
 
-(1) Collecter and Importer are one piece of Golang software
-
-(2) Collecter has different sources: File system and Git
-repositories. Easy to extend to HTTP downloads or other sources. We
-expect XML data, but different formats exist.
-
-(3) Collecter forwards streams to Importer which imports data into
-{GraphDB}. Importer speaks GraphDB HTTP protocol for importing files.
-
-(4) GraphDB was chosen because of {REASONS}.
-
-(5) Import jobs not to be scheduled. Everything is handled by a
-collectlib. (TODO: can we do everything w/o web interface?)
-
-(6) On top of command line utilities we provide a web user interface
-for managing these import jobs.
-
-(7) Golang proved to be a good choice for this task as really
-it's just about sending stuff around on the web.
+We previously described Collecter and Importer as two distinct
+components. The Collecter pulls RDF data from various sources and
+outputs a stream of standardized RDF data, while the Importer takes
+such a stream and writes it to some kind of persistent storage.
+However, in the implementation for \emph{ulo-storage}, both
+Collecter and Importer ended up being one piece of monolithic
+software. This does not need to be the case but simply proved
+convenient.
+
+Our implementation supports two sources of RDF files, namely Git
+repositories and the local file system. The file system Collecter
+simply crawls a given directory on the local machine and looks for
+RDF~XML~files~\cite{rdfxml}, while the Git Collecter first clones a
+Git repository and then passes the checked-out working copy to the
+file system Collecter. Because it is not uncommon for RDF files to
+be compressed, our Collecter supports on-the-fly extraction of the
+Gzip~\cite{gzip} and XZ~\cite{xz} formats, which can greatly reduce
+the disk space required during the collection step.
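+
+As an illustration, consider the following Go sketch of such an
+extraction step. Gzip support ships with the Go standard library;
+for XZ we assume the third-party \texttt{github.com/ulikunitz/xz}
+package, so the actual implementation may differ in detail.
+
+\begin{verbatim}
+package storage // illustrative sketch
+
+import (
+    "compress/gzip"
+    "io"
+    "strings"
+
+    "github.com/ulikunitz/xz"
+)
+
+// decompress wraps r in a decompressing reader chosen by file
+// name extension. Uncompressed files are passed through as-is.
+func decompress(name string, r io.Reader) (io.Reader, error) {
+    switch {
+    case strings.HasSuffix(name, ".gz"):
+        return gzip.NewReader(r)
+    case strings.HasSuffix(name, ".xz"):
+        return xz.NewReader(r)
+    default:
+        return r, nil
+    }
+}
+\end{verbatim}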
+
+During development of the Collecter, we found that existing exports
+from third party mathematical libraries contain RDF syntax errors
+which had not been discovered previously. In particular, both the
+Isabelle and the Coq exports contained URIs which do not conform to
+the official specification~\cite{rfc3986}. Previous work that
+processed Coq and Isabelle exports used database software such as
+Virtuoso Open Source~\cite{ulo} which does not validate URIs against
+the specification; as a consequence, these faults went unnoticed
+until now. To tackle this problem, we introduced on-the-fly
+correction steps during collection that take the broken RDF files,
+fix the URI problems (by escaping illegal characters) and then
+continue processing. Of course this is only a work-around; related
+bugs were filed with the respective export projects to ensure that
+this extra step will not be necessary in the future.
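+
+A simplified Go sketch of such an escaping step is given below; it
+is a stand-in for the actual correction code, which is more involved
+in practice. The sketch percent-encodes every byte that RFC~3986
+does not permit to appear literally in a URI and leaves the percent
+sign itself alone so that already-escaped sequences survive.
+
+\begin{verbatim}
+package storage // illustrative sketch
+
+import (
+    "fmt"
+    "strings"
+)
+
+// escapeURI percent-encodes bytes that RFC 3986 does not allow
+// to appear literally in a URI.
+func escapeURI(uri string) string {
+    const allowed = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
+        "abcdefghijklmnopqrstuvwxyz" +
+        "0123456789-._~:/?#[]@!$&'()*+,;=%"
+    var b strings.Builder
+    for i := 0; i < len(uri); i++ {
+        if strings.IndexByte(allowed, uri[i]) >= 0 {
+            b.WriteByte(uri[i])
+        } else {
+            fmt.Fprintf(&b, "%%%02X", uri[i])
+        }
+    }
+    return b.String()
+}
+\end{verbatim}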
+
+Our Collecter takes existing RDF files, applies on-the-fly
+transformations (extraction of compressed files, correction of
+errors) and produces a stream of RDF data as its result. This
+stream gets passed to the Importer which imports the encoded RDF
+triplets into some kind of persistent storage. The canonical choice
+for this task is a triple store, that is, a database optimized for
+storing RDF triplets~\cite{triponto, tripw3c}. For our project, we
+used the GraphDB~\cite{graphdb} triple store as it is easy to use
+and a free version that fits our needs is
+available~\cite{graphdbfree}. The import itself is straightforward;
+our software only needs to upload the RDF file stream as-is to an
+HTTP endpoint provided by our GraphDB instance.
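+
+The upload then reduces to a single HTTP request. The Go sketch
+below posts the stream to the statements endpoint of the RDF4J REST
+API which GraphDB implements; the server address and the repository
+name \texttt{ulo} are placeholders. A successful import is answered
+with status 204~(No~Content).
+
+\begin{verbatim}
+package storage // illustrative sketch
+
+import (
+    "fmt"
+    "io"
+    "net/http"
+)
+
+// upload sends a stream of RDF XML data to a GraphDB instance.
+func upload(rdf io.Reader) error {
+    const endpoint = "http://localhost:7200/repositories/ulo/statements"
+    resp, err := http.Post(endpoint, "application/rdf+xml", rdf)
+    if err != nil {
+        return err
+    }
+    defer resp.Body.Close()
+    if resp.StatusCode != http.StatusNoContent {
+        return fmt.Errorf("unexpected status: %s", resp.Status)
+    }
+    return nil
+}
+\end{verbatim}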
+
+\subsubsection{Scheduling and Version Management}
+
+Collecter and Importer were implemented as library code that can be
+called from various front ends. For this project, we provide both a
+command line interface and a graphical web front end. While the
+command line interface is only useful for manually starting single
+jobs, the web interface allows the user to schedule and automate
+import jobs. For example, it is possible to schedule an import of a
+given Git repository into a given GraphDB instance every seven days,
+as in the sketch below.
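+
+At its core, such a scheduled job is little more than a timer that
+periodically triggers the import pipeline. The following Go sketch
+illustrates the idea; \texttt{runImport} is a hypothetical stand-in
+for one combined Collecter and Importer run.
+
+\begin{verbatim}
+package storage // illustrative sketch
+
+import (
+    "log"
+    "time"
+)
+
+// schedule runs the given import job at a fixed interval,
+// e.g. schedule(7*24*time.Hour, runImport).
+func schedule(interval time.Duration, runImport func() error) {
+    ticker := time.NewTicker(interval)
+    defer ticker.Stop()
+    for range ticker.C {
+        if err := runImport(); err != nil {
+            log.Printf("import failed: %v", err)
+        }
+    }
+}
+\end{verbatim}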
+
+Automated job control alone, however, does not solve the problem of
+versioning. ULO exports~$\mathcal{E}$ depend on an original third
+party library~$\mathcal{L}$. Running~$\mathcal{E}$ through the
+workflow of Collecter and Importer, we get some database
+representation~$\mathcal{D}$. We see that the data flows
+\begin{align*}
+  \mathcal{L} \rightarrow \mathcal{E} \rightarrow \mathcal{D}
+\end{align*}
+which means that if records in~$\mathcal{L}$ change, this will
+probably result in different triplets~$\mathcal{E}$, which in turn
+results in a need to update~$\mathcal{D}$. This is difficult. As it
+stands, \emph{ulo-storage} only knows about what is in~$\mathcal{E}$.
+While it should be possible to compute the difference between a new
+version of~$\mathcal{E}$ and the current version of~$\mathcal{D}$
+and derive the changes that need to be applied to~$\mathcal{D}$, the
+large number of triplets makes this appear infeasible. So far, our
+only suggestion for handling changing third party libraries is to
+regularly re-create the full data set~$\mathcal{D}$ from scratch,
+say every seven days. This circumvents all problems related to
+updating existing data sets, but it does incur additional
+computational cost. For the currently existing exports from Coq and
+Isabelle this is not a problem, as even on weak laptop hardware the
+imports take less than an hour. But if the number of triplets rises
+by orders of magnitude, this approach will eventually no longer
+scale.
 
 \subsection{Endpoints}\label{sec:endpoints}
 
diff --git a/doc/report/references.bib b/doc/report/references.bib
index 4a787fc..e66788b 100644
--- a/doc/report/references.bib
+++ b/doc/report/references.bib
@@ -246,3 +246,70 @@
     url = {https://www.cs.helsinki.fi/u/mjarvisa/papers/jarvisalo-matsliah-nordstrom-zivny.cp12.pdf},
     urldate = {2020-07-30}
 }
+
+@online{rdfxml,
+    title = {RDF 1.1 XML Syntax},
+    organization = {W3C},
+    date = {2014},
+    urldate = {2020-08-17},
+    url = {https://www.w3.org/TR/rdf-syntax-grammar/},
+}
+
+@misc{gzip,
+    title = {{GZIP} file format specification version 4.3},
+    author = {Deutsch, Peter},
+    series = {Request for Comments},
+    number = 1952,
+    howpublished = {RFC 1952},
+    publisher = {RFC Editor},
+    year = {1996},
+    month = may,
+    url = {https://rfc-editor.org/rfc/rfc1952.txt},
+    urldate = {2020-08-17},
+}
+
+@misc{xz,
+    title = {The .xz File Format},
+    author = {Collin, Lasse and Pavlov, Igor},
+    year = {2009},
+    url = {https://tukaani.org/xz/xz-file-format.txt},
+    urldate = {2020-08-17},
+}
+
+@misc{rfc3986,
+    title = {{Uniform Resource Identifier (URI): Generic Syntax}},
+    author = {Tim Berners-Lee and Roy T. Fielding and Larry M. Masinter},
+    series = {Request for Comments},
+    number = 3986,
+    howpublished = {RFC 3986},
+    publisher = {RFC Editor},
+    doi = {10.17487/RFC3986},
+    pagetotal = 61,
+    year = {2005},
+    month = jan,
+    url = {https://rfc-editor.org/rfc/rfc3986.txt},
+    urldate = {2020-08-17},
+}
+
+@misc{wikivirtuoso,
+    title = {Virtuoso Open-Source Edition},
+    organization = {OpenLink Software},
+}
+
+@online{tripw3c,
+    title = {Triple Store},
+    organization = {W3C},
+    date = {2001},
+    urldate = {2020-08-17},
+    url = {https://www.w3.org/2001/sw/Europe/events/20031113-storage/positions/rusher.html},
+}
+
+@online{triponto,
+    title = {What is RDF Triplestore?},
+    organization = {Ontotext},
+    date = {2020},
+    urldate = {2020-08-17},
+    url = {https://www.ontotext.com/knowledgehub/fundamentals/what-is-rdf-triplestore/},
+}
+
+@online{graphdbfree,
+    title = {GraphDB Feature Comparison},
+    organization = {Ontotext},
+    date = {2020},
+    urldate = {2020-08-17},
+    url = {http://graphdb.ontotext.com/documentation/free/graphdb-feature-comparison.html},
+}
\ No newline at end of file
-- 
GitLab