diff --git a/ulo/scripts/coq-cut.sh b/ulo/scripts/coq-cut.sh index 1f6ee805e7ed3cafeacd427a4a07a5ff38c2b5b7..c98fac5b975dde53c336a06e41b2dff1527acf61 100755 --- a/ulo/scripts/coq-cut.sh +++ b/ulo/scripts/coq-cut.sh @@ -1,7 +1,7 @@ #! /bin/sh -# cut out some components from the rdf files exported from Coq [1] -# that GraphDB does not like; right now I don't know if they +# Cut out some components from the rdf files exported from Coq [1] +# that GraphDB does not like. Right now I don't know if they # are important or if they can be skipped, but I just want a working # prototype first of all # diff --git a/ulo/scripts/extract-iris.sh b/ulo/scripts/extract-iris.sh index cd42f55990199b88faa58b7c8001489c6138e42d..cc0c0e179390c1b5af1393ccfe697c0e0898af25 100755 --- a/ulo/scripts/extract-iris.sh +++ b/ulo/scripts/extract-iris.sh @@ -1,7 +1,7 @@ #! /bin/sh -# extract iris from rdf files passed on stdin; really it just returns all -# quoted strings +# Extract IRIs from rdf files passed on stdin. Actually just returns +# all quoted strings. set -eu diff --git a/ulo/scripts/fix-rdf-file.py b/ulo/scripts/fix-rdf-file.py index 9bb25adae6f49425c12e1188c95c61232288d659..1071ed18df3f20dc0ea53ab9b3b18b4240863d2f 100755 --- a/ulo/scripts/fix-rdf-file.py +++ b/ulo/scripts/fix-rdf-file.py @@ -1,7 +1,12 @@ #! /usr/bin/env python3 +''' +fix-rdf-file.py: Given an RDF file passed on stdin, fix some errors in +IRIs. This is not a complete solution, but a shim to get a working first +version. 
+''' + -from urllib import parse import re import sys @@ -9,10 +14,6 @@ import sys def fix_quoted(s: str) -> str: payload = s.strip('"') - #fixed = parse.quote(payload) - #fixed = fixed.replace('http%3A', 'http:') - #fixed = fixed.replace('https%3A', 'https:') - bad_chars = ( '|', '\\', ' ', '^' ) diff --git a/ulo/scripts/isabelle-prepare-directory.sh b/ulo/scripts/isabelle-prepare-directory.sh new file mode 100755 index 0000000000000000000000000000000000000000..9d76e4aabd25d4399a563e2d51c45b82df82a2e9 --- /dev/null +++ b/ulo/scripts/isabelle-prepare-directory.sh @@ -0,0 +1,25 @@ +#! /bin/sh + +set -eu + +# Prepare DIRECTORY for import w/ the GraphDB web interface. +# +# That is, it searches for *.rdf.xz files in DIRECTORY and passes +# each file to the isabelle-prepare-file.sh script. +# +# If you want to prepare Isabelle exported files for GraphDB, you +# probably should use this script. + +if [ ! $# -eq 1 ]; then + echo "usage: $0 DIRECTORY" 1>&2 + exit 1 +fi + +directory="$1" +threads=6 + +script_dir=$(dirname "$0") +cd "$script_dir" + +# One isabelle-prepare-file.sh run per file (it takes exactly one FILE), $threads in parallel. +find "$directory" -name "*.rdf.xz" -print0 | xargs -0 -n 1 -P "$threads" ./isabelle-prepare-file.sh diff --git a/ulo/scripts/isabelle-prepare-file.sh b/ulo/scripts/isabelle-prepare-file.sh new file mode 100755 index 0000000000000000000000000000000000000000..e778f7e85e826c783fb7285296208911f3cdbff4 --- /dev/null +++ b/ulo/scripts/isabelle-prepare-file.sh @@ -0,0 +1,30 @@ +#! /bin/sh + +set -eu + +# Given a single *.rdf.xz FILE, (1) extract that file, (2) apply fixes +# with fix-rdf-file.py and (3) re-compress with gzip as GraphDB only +# supports gzip and not xz. + +if [ ! 
$# -eq 1 ]; then + echo "usage: $0 FILE" 1>&2 + exit 1 +fi + +script_dir=$(dirname "$0") +cd "$script_dir" + +file="$1" + +echo "$file" 1>&2 + +# the filename of the extracted (uncompressed) file +rdf_file=$(echo "$file" | sed 's/\.xz$//') +work_file=$(mktemp) + +# uncompress, fix iris +xzcat "$file" | ./fix-rdf-file.py > "$work_file" +mv "$work_file" "$rdf_file" + +# compress again +gzip "$rdf_file" diff --git a/ulo/scripts/isabelle-prepare.sh b/ulo/scripts/isabelle-prepare.sh deleted file mode 100755 index 5f6e6fa4f67b1c459fa135b8d11558f23a4d2a3c..0000000000000000000000000000000000000000 --- a/ulo/scripts/isabelle-prepare.sh +++ /dev/null @@ -1,36 +0,0 @@ -#! /bin/sh - -set -eu - -# prepare DIRECTORY for import w/ the graphdb web -# interface; this is for our prototype, I can't imagine -# we'll use the web interface in production -# -# this script is a fork of xz-to-gz.sh, found in the -# same directory - -if [ ! $# -eq 1 ]; then - echo "usage: $0 DIRECTORY" 1>&2 - exit 1 -fi - -script_dir=$(dirname "$0") -cd "$script_dir" - -directory="$1" -files=$(find "$directory" -name "*.xz") - -for file in $files; do - echo "$file" 1>&2 - - # the filename of the extracted (uncompressed) file - rdf_file=$(echo "$file" | sed 's/\.xz//') - work_file=$(mktemp) - - # uncompress, fix iris - xzcat "$file" | ./fix-rdf-file.py > "$work_file" - mv "$work_file" "$rdf_file" - - # compress again - gzip "$rdf_file" -done diff --git a/ulo/scripts/build_virtuoso.sh b/ulo/scripts/virtuso/build_virtuoso.sh similarity index 100% rename from ulo/scripts/build_virtuoso.sh rename to ulo/scripts/virtuso/build_virtuoso.sh diff --git a/ulo/scripts/import-recursive-xz-rdf-to-virtuoso.sh b/ulo/scripts/virtuso/import-recursive-xz-rdf-to-virtuoso.sh similarity index 100% rename from ulo/scripts/import-recursive-xz-rdf-to-virtuoso.sh rename to ulo/scripts/virtuso/import-recursive-xz-rdf-to-virtuoso.sh diff --git a/ulo/scripts/xz-to-gz.sh b/ulo/scripts/xz-to-gz.sh deleted file mode 100755 index 
668dd51c2e258891767f766fc1163a2c45121d9a..0000000000000000000000000000000000000000 --- a/ulo/scripts/xz-to-gz.sh +++ /dev/null @@ -1,26 +0,0 @@ -#! /bin/sh - -# given a path, traverse that file system tree and extract all -# .xz files and re-compress them to .gz; some tooling, in particular -# graphdb, only supports gz and not xz - -set -eu - -if [ ! $# -eq 1 ]; then - echo "usage: $0 DIRECTORY" 1>&2 - exit 1 -fi - -directory="$1" -files=$(find "$directory" -name "*.xz") - -for file in $files; do - echo "$file" 1>&2 - - # the filename of the extracted (uncompressed) file - rdf_file=$(echo "$file" | sed 's/\.xz//') - - # uncompress and then compress again - unxz "$file" - gzip "$rdf_file" -done