diff --git a/timeline/week18.txt b/timeline/week18.txt
index aef7f6fad8f54528e4cef697aeff85b1703de3a7..2188172071fbc4d32b913acea612b3fb1e3fccf4 100644
--- a/timeline/week18.txt
+++ b/timeline/week18.txt
@@ -37,6 +37,10 @@ Week 18 (27.04.-03.05.)
       characters; the lowest of the range is 0xa0, which is below
       0x7c (the pipe symbol)
 
+    -> the IRIs in the Isabelle exports have the following
+      characters which are (according to automated tool
+      `iriok' [4]) not valid: '|', '\' and ' '
+
     -> importing the files w/ fixed IRIs I get new errors from
       graphDB (sigh)
 
@@ -63,3 +67,4 @@ References
 [1] https://www.w3.org/TR/2014/REC-rdf11-concepts-20140225/
 [2] https://tools.ietf.org/html/rfc3986
 [3] https://tools.ietf.org/html/rfc3987
+[4] https://gitlab.cs.fau.de/kissen/iriok
diff --git a/ulo/extract-iris.sh b/ulo/extract-iris.sh
new file mode 100755
index 0000000000000000000000000000000000000000..cd42f55990199b88faa58b7c8001489c6138e42d
--- /dev/null
+++ b/ulo/extract-iris.sh
@@ -0,0 +1,13 @@
+#! /bin/sh
+
+# extract iris from rdf files passed on stdin; really it just returns all
+# double-quoted strings, one per output line, with the quotes removed
+#
+# the pattern is "[^"]*" rather than ".*" so that a line holding several
+# quoted strings yields each of them on its own instead of one greedy
+# match running from the first quote to the last; this also drops the
+# need for GNU grep's -P
+
+set -eu
+
+grep -o '"[^"]*"' | tr -d '"'
diff --git a/ulo/isabelle-prepare.sh b/ulo/isabelle-prepare.sh
new file mode 100755
index 0000000000000000000000000000000000000000..94591094011382737d6c2886bd0b3f8c6c78c6b8
--- /dev/null
+++ b/ulo/isabelle-prepare.sh
@@ -0,0 +1,42 @@
+#! /bin/sh
+
+set -eu
+
+# prepare DIRECTORY for import w/ the graphdb web
+# interface; this is for our prototype, I can't imagine
+# we'll use the web interface in production
+#
+# every *.xz file below DIRECTORY is uncompressed, gets
+# each '|' rewritten to its percent-encoded form %7C and
+# is compressed again with gzip
+#
+# this script is a fork of xz-to-gz.sh, found in the
+# same directory
+
+if [ "$#" -ne 1 ]; then
+    printf 'usage: %s DIRECTORY\n' "$0" 1>&2
+    exit 1
+fi
+
+directory="$1"
+
+# find passes the file names to an inner shell as arguments
+# rather than having the outer shell word-split its output,
+# so paths w/ spaces or glob characters survive; the inner
+# script is single-quoted, hence double quotes only in there
+find "$directory" -type f -name '*.xz' -exec sh -c '
+    set -eu
+
+    for file in "$@"; do
+        printf "%s\n" "$file" 1>&2
+
+        # the filename of the extracted (uncompressed) file;
+        # strip the .xz suffix only, not a .xz further left
+        rdf_file=${file%.xz}
+
+        # uncompress, fix iris, compress again
+        unxz -- "$file"
+        sed -i -- "s/|/%7C/g" "$rdf_file"
+        gzip -- "$rdf_file"
+    done
+' sh {} +