From 88f1d15d9f430818d7a707558203476f21b4bbed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Sch=C3=A4rtl?= <andreas@schaertl.me>
Date: Mon, 17 Aug 2020 14:33:05 +0200
Subject: [PATCH] report: add section on collecter and importer

---
 doc/report/implementation.tex | 104 +++++++++++++++++++++++++++-------
 doc/report/references.bib     |  67 ++++++++++++++++++++++
 2 files changed, 149 insertions(+), 22 deletions(-)

diff --git a/doc/report/implementation.tex b/doc/report/implementation.tex
index 83ca3a5..ef3b5fc 100644
--- a/doc/report/implementation.tex
+++ b/doc/report/implementation.tex
@@ -31,9 +31,6 @@ implemented components and their relationships.
   use in this project, the GraphDB~\cite{graphdb} triplet store was a
   natural fit.
 
-  For this project, both Collecter and Importer ended up being one
-  piece of monolithic software, but this does not have to be the case.
-
 \item Finally, with all triplets stored in a database, an
   \emph{Endpoint} is where applications access the underlying
   knowledge base. This does not necessarily need to be any custom
@@ -54,25 +51,88 @@ take a look at the actual implementation created for
 \emph{ulo-storage}.
 
 \subsection{Collecter and Importer}\label{sec:collecter}
 
-(1) Collecter and Importer are one piece of Golang software
-
-(2) Collecter has different sources: File system and Git
-repositories. Easy to extend to HTTP downloads or other sources. We
-expect XML data, but different formats exist.
-
-(3) Collecter forwards streams to Importer which imports data into
-{GraphDB}. Importer speaks GraphDB HTTP protocol for importing files.
-
-(4) GraphDB was chosen because of {REASONS}.
-
-(5) Import jobs not to be scheduled. Everything is handled by a
-collectlib. (TODO: can we do everything w/o web interface?)
-
-(6) On top of command line utilities we provide a web user interface
-for managing these import jobs.
-
-(7) Golang proved to be a good choice for this task as really
-it's just about sending stuff around on the web.
+We previously described Collecter and Importer as two distinct
+components. The Collecter pulls RDF data from various sources and
+outputs a stream of standardized RDF data, while the Importer takes
+such a stream and writes it to persistent storage. In the
+implementation for \emph{ulo-storage}, however, Collecter and
+Importer ended up as a single monolithic program. This is not a
+necessity; it simply proved convenient.
+
+Our implementation supports two sources of RDF files, namely Git
+repositories and the local file system. The file system Collecter
+crawls a given directory on the local machine and looks for
+RDF~XML~files~\cite{rdfxml}, while the Git Collecter first clones a
+Git repository and then passes the checked out working copy to the
+file system Collecter. Because it is not uncommon for RDF files to
+be compressed, the Collecter also supports on the fly decompression
+of Gzip~\cite{gzip} and XZ~\cite{xz} archives, which can greatly
+reduce the disk space required during the collection step.
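+
+To make this concrete, the following Go sketch shows how such a file
+system Collecter can crawl a directory and decompress Gzip archives
+on the fly. It is a minimal illustration, not the actual
+\emph{ulo-storage} code; all identifiers are made up for this
+example. XZ archives would be handled analogously with a third party
+decompression library, as the Go standard library has no XZ support.
+
+\begin{verbatim}
+package collect
+
+import (
+    "compress/gzip"
+    "io"
+    "os"
+    "path/filepath"
+    "strings"
+)
+
+// Crawl walks root looking for RDF/XML files and feeds each file
+// to handle as a stream, decompressing Gzip archives on the fly.
+func Crawl(root string, handle func(io.Reader) error) error {
+    walk := func(path string, info os.FileInfo, err error) error {
+        if err != nil || info.IsDir() {
+            return err
+        }
+        gzipped := strings.HasSuffix(path, ".rdf.gz")
+        if !gzipped && !strings.HasSuffix(path, ".rdf") {
+            return nil // skip everything that is not RDF/XML
+        }
+        f, err := os.Open(path)
+        if err != nil {
+            return err
+        }
+        defer f.Close()
+        var r io.Reader = f
+        if gzipped {
+            // Decompression happens while reading; no
+            // temporary files are required.
+            zr, err := gzip.NewReader(f)
+            if err != nil {
+                return err
+            }
+            defer zr.Close()
+            r = zr
+        }
+        return handle(r)
+    }
+    return filepath.Walk(root, walk)
+}
+\end{verbatim}
+
+The Git Collecter reduces to this case: it clones the repository and
+then hands the checked out working copy to \texttt{Crawl}.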
+
+During development of the Collecter, we found that existing exports
+from third party mathematical libraries contain RDF syntax errors
+that had previously gone undetected. In particular, both the
+Isabelle and the Coq exports contained URIs that do not conform to
+the official specification~\cite{rfc3986}. Previous work that
+processed these exports used database software such as Virtuoso Open
+Source~\cite{ulo} which does not validate URIs against the
+specification; as a consequence, these faults surfaced only now. To
+tackle this problem, we introduced an on the fly correction step
+during collection that takes the broken RDF files, escapes the
+illegal characters in the offending URIs and then continues
+processing. Of course this is only a work-around; we filed bugs in
+the respective export projects to ensure that this extra step will
+not be necessary in the future.
+
+In summary, the Collecter takes existing RDF files, applies on the
+fly transformations (decompression, error correction) and produces
+a stream of RDF data. This stream is passed to the Importer, which
+imports the encoded RDF triplets into some kind of persistent
+storage. The canonical choice for this task is a triple store, that
+is, a database optimized for storing RDF
+triplets~\cite{triponto, tripw3c}. For our project, we used the
+GraphDB~\cite{graphdb} triple store as it is easy to use and a free
+version that fits our needs is available~\cite{graphdbfree}. The
+import itself is straightforward: our software only needs to upload
+the RDF file stream as-is to an HTTP endpoint provided by our
+GraphDB instance.
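+
+The following sketch illustrates this upload. It assumes a GraphDB
+instance reachable at some base URL and a target repository named
+\texttt{ulo}; both are placeholders. GraphDB implements an
+RDF4J-style REST interface in which POSTing an RDF document to the
+\texttt{statements} resource of a repository adds the contained
+triplets.
+
+\begin{verbatim}
+package importer
+
+import (
+    "fmt"
+    "io"
+    "net/http"
+)
+
+// Upload streams a single RDF/XML document into a GraphDB
+// repository by POSTing it to the statements resource of the
+// repository's REST interface.
+func Upload(base, repository string, rdf io.Reader) error {
+    url := fmt.Sprintf("%s/repositories/%s/statements",
+        base, repository)
+    req, err := http.NewRequest(http.MethodPost, url, rdf)
+    if err != nil {
+        return err
+    }
+    req.Header.Set("Content-Type", "application/rdf+xml")
+    resp, err := http.DefaultClient.Do(req)
+    if err != nil {
+        return err
+    }
+    defer resp.Body.Close()
+    if resp.StatusCode < 200 || resp.StatusCode > 299 {
+        return fmt.Errorf("import failed: %s", resp.Status)
+    }
+    return nil
+}
+\end{verbatim}
+
+Combined with the Collecter sketch from above, a full import amounts
+to calling \texttt{Crawl} with a handler that passes each stream to
+\texttt{Upload}, for example against \texttt{http://localhost:7200},
+the default address of a local GraphDB installation.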
+
+\subsubsection{Scheduling and Version Management}
+
+Collecter and Importer are implemented as library code that can be
+called from various front ends. For this project, we provide both a
+command line interface and a graphical web front end. While the
+command line interface is only suited to starting single jobs
+manually, the web front end allows the user to schedule and automate
+import jobs. For example, it is possible to import a given Git
+repository into a given GraphDB instance every seven days.
+
+Automated job control alone, however, does not solve the problem of
+versioning. A ULO export~$\mathcal{E}$ depends on an original third
+party library~$\mathcal{L}$. Running~$\mathcal{E}$ through the
+workflow of Collecter and Importer yields some database
+representation~$\mathcal{D}$. Data thus flows as
+\begin{align*}
+  \mathcal{L} \rightarrow \mathcal{E} \rightarrow \mathcal{D}
+\end{align*}
+which means that if records in~$\mathcal{L}$ change, this will
+probably result in different triplets in~$\mathcal{E}$, which in
+turn requires an update of~$\mathcal{D}$. This is difficult. As it
+stands, \emph{ulo-storage} only knows about what is
+in~$\mathcal{E}$. While it should be possible to compute the
+difference between a new version of~$\mathcal{E}$ and the current
+state of~$\mathcal{D}$ and to derive the changes that need to be
+applied to~$\mathcal{D}$, the large number of triplets makes this
+appear infeasible. So far, our only suggestion for handling changing
+third party libraries is to regularly re-create the full data
+set~$\mathcal{D}$ from scratch, say every seven days. This
+circumvents all problems related to updating existing data sets, but
+it does add computational cost. For the currently existing Coq and
+Isabelle exports this is not a problem, as even on weak laptop
+hardware a full import takes less than an hour. But should the
+number of triplets rise by orders of magnitude, this approach will
+eventually stop scaling.
 
 \subsection{Endpoints}\label{sec:endpoints}
diff --git a/doc/report/references.bib b/doc/report/references.bib
index 4a787fc..e66788b 100644
--- a/doc/report/references.bib
+++ b/doc/report/references.bib
@@ -246,3 +246,70 @@
   url = {https://www.cs.helsinki.fi/u/mjarvisa/papers/jarvisalo-matsliah-nordstrom-zivny.cp12.pdf},
   urldate = {2020-07-30}
 }
+
+@online{rdfxml,
+  title = {RDF 1.1 XML Syntax},
+  organization = {W3C},
+  date = {2014},
+  urldate = {2020-08-17},
+  url = {https://www.w3.org/TR/rdf-syntax-grammar/},
+}
+
+@misc{gzip,
+  series = {Request for Comments},
+  number = 1952,
+  howpublished = {RFC 1952},
+  publisher = {RFC Editor},
+  author = {Peter Deutsch},
+  title = {{GZIP file format specification version 4.3}},
+  year = 1996,
+  month = may,
+  urldate = {2020-08-17},
+}
+
+@misc{xz,
+  title = {The .xz File Format},
+  author = {Collin, Lasse and Pavlov, Igor},
+  year = {2009},
+  urldate = {2020-08-17},
+  url = {https://tukaani.org/xz/xz-file-format.txt},
+}
+
+@misc{rfc3986,
+  series = {Request for Comments},
+  number = 3986,
+  howpublished = {RFC 3986},
+  publisher = {RFC Editor},
+  doi = {10.17487/RFC3986},
+  url = {https://rfc-editor.org/rfc/rfc3986.txt},
+  author = {Tim Berners-Lee and Roy T. Fielding and Larry M. Masinter},
+  title = {{Uniform Resource Identifier (URI): Generic Syntax}},
+  pagetotal = 61,
+  year = 2005,
+  month = jan,
+  urldate = {2020-08-17},
+}
+
+@online{wikivirtuoso,
+  title = {Virtuoso Open-Source Edition},
+  organization = {OpenLink Software},
+}
+
+@online{tripw3c,
+  title = {Triple Store},
+  organization = {W3C},
+  date = {2001},
+  urldate = {2020-08-17},
+  url = {https://www.w3.org/2001/sw/Europe/events/20031113-storage/positions/rusher.html},
+}
+
+@online{triponto,
+  title = {What is RDF Triplestore?},
+  organization = {Ontotext},
+  date = {2020},
+  urldate = {2020-08-17},
+  url = {https://www.ontotext.com/knowledgehub/fundamentals/what-is-rdf-triplestore/},
+}
+
+@online{graphdbfree,
+  title = {GraphDB Feature Comparison},
+  organization = {Ontotext},
+  date = {2020},
+  urldate = {2020-08-17},
+  url = {http://graphdb.ontotext.com/documentation/free/graphdb-feature-comparison.html},
+}
\ No newline at end of file
--
GitLab