diff --git a/doc/report/Makefile b/doc/report/Makefile index 84a327554915943a5db7e0a3b035b738427eac3f..bf2351705bfa42007f9346d1d824175ca449951a 100644 --- a/doc/report/Makefile +++ b/doc/report/Makefile @@ -1,10 +1,11 @@ TEX_SOURCES = report.tex abstract.tex intro.tex components.tex \ - endpoints.tex applications.tex conclusion.tex + endpoints.tex applications.tex conclusion.tex \ + applications-ulo-table.tex report.pdf: $(TEX_SOURCES) references.bib - chronic pdflatex $< - chronic biber $(basename $<) - chronic pdflatex $< + pdflatex $< + biber $(basename $<) + pdflatex $< view: report.pdf mupdf $< diff --git a/doc/report/applications-ulo-table.tex b/doc/report/applications-ulo-table.tex new file mode 100644 index 0000000000000000000000000000000000000000..debbe3cb148abed535cc0d10cdde38b59193e828 --- /dev/null +++ b/doc/report/applications-ulo-table.tex @@ -0,0 +1,100 @@ +\begin{figure} + \centering + \small + \begin{subfigure}[]{0.9\textwidth} + \texttt{check-time} \texttt{defines} \texttt{definition} + \texttt{derived} \texttt{experimental} + \texttt{external-size} \texttt{function} + \texttt{important} \texttt{inductive-on} + \texttt{instance-of} \texttt{justifies} \texttt{name} + \texttt{para} \texttt{paratype} \texttt{predicate} + \texttt{primitive} \texttt{section} \texttt{sourceref} + \texttt{specified-in} \texttt{specifies} + \texttt{statement} \texttt{theory} \texttt{type} + \texttt{unimportant} \texttt{universe} \texttt{uses} + \caption{ULO predicates used in the Isabelle exports.}\label{fig:used} + \end{subfigure} + \vspace{0.5cm} + + \begin{subfigure}[]{.9\textwidth} + \texttt{action-times} \texttt{aligned-with} + \texttt{alternative-for} \texttt{antonym} + \texttt{automatically-proved} \texttt{axiom} + \texttt{constructs} \texttt{contains} + \texttt{counter-example-for} \texttt{crossrefs} + \texttt{declaration} \texttt{deprecated} \texttt{docref} + \texttt{example} \texttt{example-for} \texttt{file} + \texttt{folder} \texttt{formalizes} \texttt{generated-by} + \texttt{hypernym} \texttt{hyponym} + \texttt{implementation-uses} + \texttt{implementation-uses-implementation-of} + \texttt{implementation-uses-interface-of} \texttt{inspired-by} + \texttt{inter-statement} \texttt{interface-uses} + \texttt{interface-uses-implementation-of} + \texttt{interface-uses-interface-of} \texttt{internal-size} + \texttt{last-checked-at} \texttt{library} + \texttt{library-group} \texttt{logical} + \texttt{mutual-block} \texttt{nyms} + \texttt{organizational} \texttt{phrase} \texttt{physical} + \texttt{proof} \texttt{proposition} \texttt{revision} + \texttt{rule} \texttt{same-as} \texttt{see-also} + \texttt{similar-to} \texttt{size-properties} + \texttt{superseded-by} \texttt{theorem} \texttt{typedec} + \texttt{uses-implementation} \texttt{uses-interface} + \caption{ULO predicates \emph{not} used in the Isabelle exports.}\label{fig:used} + \end{subfigure} + \vspace{0.5cm} + + \begin{subfigure}[]{.9\textwidth} + \texttt{axiom} \texttt{definition} + \texttt{derived} \texttt{example} \texttt{file} + \texttt{folder} \texttt{internal-size} + \texttt{library} \texttt{library-group} + \texttt{predicate} \texttt{primitive} + \texttt{proposition} \texttt{revision} + \texttt{statement} \texttt{theory} \texttt{type} + \texttt{uses} + \caption{ULO predicates used in the Coq exports.}\label{fig:used} + \end{subfigure} + \vspace{0.5cm} + + \begin{subfigure}[]{.9\textwidth} + \texttt{action-times} \texttt{aligned-with} + \texttt{alternative-for} \texttt{antonym} + \texttt{automatically-proved} \texttt{check-time} + \texttt{constructs} \texttt{contains} + \texttt{counter-example-for} \texttt{crossrefs} + \texttt{declaration} \texttt{defines} + \texttt{deprecated} \texttt{docref} + \texttt{example-for} \texttt{experimental} + \texttt{external-size} \texttt{formalizes} + \texttt{function} \texttt{generated-by} + \texttt{hypernym} \texttt{hyponym} + \texttt{implementation-uses} + \texttt{implementation-uses-implementation-of} + \texttt{implementation-uses-interface-of} + \texttt{important} \texttt{inductive-on} + \texttt{inspired-by} \texttt{instance-of} + \texttt{inter-statement} \texttt{interface-uses} + \texttt{interface-uses-implementation-of} + \texttt{interface-uses-interface-of} + \texttt{justifies} \texttt{last-checked-at} + \texttt{logical} \texttt{mutual-block} + \texttt{name} \texttt{nyms} + \texttt{organizational} \texttt{para} + \texttt{paratype} \texttt{phrase} + \texttt{physical} \texttt{proof} \texttt{rule} + \texttt{same-as} \texttt{section} + \texttt{see-also} \texttt{similar-to} + \texttt{size-properties} \texttt{sourceref} + \texttt{specified-in} \texttt{specifies} + \texttt{superseded-by} \texttt{theorem} + \texttt{typedec} \texttt{unimportant} + \texttt{universe} \texttt{uses-implementation} + \texttt{uses-interface} + \caption{ULO predicates \emph{not} used in the Coq exports.}\label{fig:used} + \end{subfigure} + \vspace{0.5cm} + + \caption{ULO predicates in their usage}\label{fig:used} +\end{figure} diff --git a/doc/report/applications.tex b/doc/report/applications.tex index 9728a95d3a2c07bbdfe6b015a4eaebcf419cf160..09f5a181946f8e338cd23d876b7c6862cc92b213 100644 --- a/doc/report/applications.tex +++ b/doc/report/applications.tex @@ -1,9 +1,54 @@ \newpage \section{Applications}\label{sec:applications} -With endpoints in place, we can now query the ULO/RDF -data set. Depending on the kind of application, different interfaces -and approaches to querying the database might make sense. +With endpoints in place, we can now query the ULO/RDF data set. This +section describes some experiments with the \emph{ulo-endpoint} +Endpoint {API}. In particular, we query the storage backend for some +statistics, implement some queries suggested for tetrapodal search + +\subsection{Exploring Existing Data Sets} + +As previously stated, there already exist exports to ULO for both +Isabelle and Coq libraries~\cite{uloisabelle, ulocoq}. As a very first +application, we simply look at what ULO predicates are actually used +by the respective data sets. Implementing such a query is not very +difficult. In SPARQL, this can be achieved with the \texttt{COUNT} +aggregate. +\begin{lstlisting} + PREFIX ulo: <https://mathhub.info/ulo#> + + SELECT ?predicate (COUNT(?predicate) as ?count) + WHERE { + ?s ?predicate ?o . + } + GROUP BY ?predicate + ORDER BY DESC(?count) +\end{lstlisting} +This yields a list of all used predicates with \texttt{?count} being +the number of occurrences. Looking at the results, we find that both +the Isabelle and the Coq data sets only use subsets of the predicates +provided by the ULO ontology. The results are listed in +figure~\ref{fig:used}. In both cases, the exports use less than a +third of the available predicates. + +\input{applications-ulo-table.tex} + +We also see that the Isabelle and Coq exports use different +predicates. For example, the Isabelle contains organizational meta +information such as information about paragraphs and sections in the +source document while the Coq export only tells us about the filename +of the Coq source. That is not particularly problematic as long as we +can trace a given object back to the original Isabelle/Coq source. + +However, our results do show that both exports have their own +particularities and with more and more third party libraries exported +to ULO one can assume that this heterogeneity only grows. In particular +we want to point to the large number of predicates which remain unused +in both Isabelle and Coq exports. A user formulating queries for ULO +might be oblivious to the fact that only subsets of exports support +given predicates. While not a problem for \emph{ulo-storage} per se, +we expect this to be a major challenge when building a system of +tetrapodal search. \subsection{Querying for Tetrapodal Search} @@ -209,8 +254,8 @@ proof of concept implementations. } GROUP BY ?work ORDER BY DESC(?refcount) - \end{lstlisting} - We see that we can formulate the idea behind~$\mathcal{Q}_5$ with - one not very complicated SPARQL query. Because here everything is - handled by the database access should be quick. + \end{lstlisting} + We see that we can formulate the idea behind~$\mathcal{Q}_5$ with + one not very complicated SPARQL query. Because here everything is + handled by the database access should be quick. \end{itemize} diff --git a/doc/report/report.tex b/doc/report/report.tex index 5cf1a731cf31e4e5b096cc5c200a2a3a0ad18bbe..28222aa6f0471c5020b7893d55148aea27c2a891 100644 --- a/doc/report/report.tex +++ b/doc/report/report.tex @@ -1,6 +1,7 @@ \documentclass[a4paper]{scrartcl} \usepackage{amsmath} +\usepackage{booktabs} \usepackage{caption} \usepackage{float} \usepackage{geometry} @@ -10,6 +11,8 @@ \usepackage{listings} \usepackage{multicol} \usepackage{subcaption} +\usepackage{subcaption} +\usepackage{tabularx} \usepackage{verbatim} \usepackage[utf8]{inputenc} \usepackage[ diff --git a/doc/report/report.toc b/doc/report/report.toc deleted file mode 100644 index 541ac01b08afc2c39a9e1fc08a86a377e3ebbfb2..0000000000000000000000000000000000000000 --- a/doc/report/report.toc +++ /dev/null @@ -1,19 +0,0 @@ -\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax -\defcounter {refsection}{0}\relax -\contentsline {section}{\numberline {1}Introduction}{2}{section.1}% -\defcounter {refsection}{0}\relax -\contentsline {section}{\numberline {2}Components}{3}{section.2}% -\defcounter {refsection}{0}\relax -\contentsline {section}{\numberline {3}Collecter and Importer}{5}{section.3}% -\defcounter {refsection}{0}\relax -\contentsline {section}{\numberline {4}Endpoints}{6}{section.4}% -\defcounter {refsection}{0}\relax -\contentsline {subsection}{\numberline {4.1}SPARQL}{6}{subsection.4.1}% -\defcounter {refsection}{0}\relax -\contentsline {subsection}{\numberline {4.2}RDF4J}{6}{subsection.4.2}% -\defcounter {refsection}{0}\relax -\contentsline {section}{\numberline {5}Applications}{7}{section.5}% -\defcounter {refsection}{0}\relax -\contentsline {subsection}{\numberline {5.1}Querying for Tetrapodal Search}{7}{subsection.5.1}% -\defcounter {refsection}{0}\relax -\contentsline {section}{\numberline {6}Conclusion and Future Work}{11}{section.6}% diff --git a/experimental/ulo/seperate.py b/experimental/ulo/seperate.py new file mode 100755 index 0000000000000000000000000000000000000000..c2148bdbecf490184161766c652d2550f5d87909 --- /dev/null +++ b/experimental/ulo/seperate.py @@ -0,0 +1,58 @@ +#! /usr/bin/env python3 + +# +# generate-table.py +# +# Given CSV data passed on stdin with two columns (number of +# occurences, URI), split up the lines in those that have at least one +# occurence and those that do not. +# + + +from typing import Tuple +import csv +import sys + + +def shorten_predicate(uri: str) -> str: + ULO_NAMESPACE = 'https://mathhub.info/ulo#' + DCTERMS_NAMESPACE = 'http://purl.org/dc/terms/' + + uri = uri.replace(ULO_NAMESPACE, 'ulo:') + uri = uri.replace(DCTERMS_NAMESPACE, 'dcterms:') + + return r'\texttt{%s}' % uri + + +def main(): + occupied = [] + unoccupied = [] + + for row in csv.reader(sys.stdin): + if row: + predicate = shorten_predicate(row[1]) + occurence = int(row[0]) + + if occurence == 0: + unoccupied.append(predicate) + else: + occupied.append(predicate) + + print('--------------- OCCUPIED ---------------') + for pred in occupied: + print(pred) + + print('\n------------ UNOCCUPIED ------------') + for pred in unoccupied: + print(pred) + + print('\n------------ STATS ------------') + print('#occupied %d' % len(occupied)) + print('#unoccupied %d' % len(unoccupied)) + + +if __name__ == '__main__': + try: + main() + except (KeyboardInterrupt, SystemExit, BrokenPipeError): + pass