diff --git a/ulocollect/core/iri_fix_importer.go b/ulocollect/core/iri_fix_importer.go new file mode 100644 index 0000000000000000000000000000000000000000..0a7c362fc8b82bb03706e839843235392e719c7c --- /dev/null +++ b/ulocollect/core/iri_fix_importer.go @@ -0,0 +1,109 @@ +package core + +import ( + "bufio" + "fmt" + "io" + "log" +) + +// Middleware Importer that (1) fixes all IRIs by escaping them quite +// aggressively and then (2) forward them all to another Importer. +type IriFixImporter struct { + Next Importer +} + +// Implements io.Reader for use in IriFixImporter. +type irifixreader struct { + // We read bytes from this Reader. + source io.Reader + + // We put converted bytes into this ch. + ch chan byte +} + +func (ifi IriFixImporter) Import(rdf io.Reader) error { + proxy := &irifixreader{ + source: rdf, + ch: make(chan byte, 1024), + } + + go proxy.writeToChan() + return ifi.Next.Import(proxy) +} + +// Implement io.Reader +func (ifr *irifixreader) Read(p []byte) (nbytes int, err error) { + for nbytes = 0; nbytes < len(p); nbytes += 1 { + if b, ok := <-ifr.ch; !ok { + break + } else { + p[nbytes] = b + } + } + + if nbytes == 0 { + return 0, io.EOF + } + + return nbytes, nil +} + +// Fill ifr.ch with with a fixed version of ifr.source. +func (ifr *irifixreader) writeToChan() { + br := bufio.NewReader(ifr.source) + + insideQuotes := false + + for { + r, _, err := br.ReadRune() + + // error handling + + if err == io.EOF { + close(ifr.ch) + break + } + + if err != nil { + close(ifr.ch) + log.Print(err) + break + } + + // state machine + + if r == '"' { + insideQuotes = !insideQuotes + } + + var bs []byte + + if insideQuotes { + bs = ifr.fixed(r) + } else { + bs = []byte(string(r)) + } + + // Write out to channel (where it will eventually + // be consumed by Read). + + for _, b := range bs { + ifr.ch <- b + } + } +} + +func (ifr *irifixreader) fixed(r rune) []byte { + bads := []rune{ + '|', '\n', ' ', '^', '\\', + } + + for _, bad := range bads { + if r == bad { + return []byte(fmt.Sprintf("%%%X", r)) + } + } + + return []byte(string(r)) +} diff --git a/ulocollect/core/iri_fix_importer_test.go b/ulocollect/core/iri_fix_importer_test.go new file mode 100644 index 0000000000000000000000000000000000000000..50c2ac26b9d964e53485ed3f18855c54643b4cef --- /dev/null +++ b/ulocollect/core/iri_fix_importer_test.go @@ -0,0 +1,90 @@ +package core + +import ( + "bytes" + "io" + "io/ioutil" + "testing" +) + +// Importer that stores all imported bytes to buffer sink. +type stringimporter struct { + sink []byte +} + +func (sim *stringimporter) Import(rdf io.Reader) error { + if bs, err := ioutil.ReadAll(rdf); err != nil { + return err + } else { + sim.sink = bs + return nil + } +} + +func (sim *stringimporter) Sink() string { + return string(sim.sink) +} + +func TestZeroQuotes(t *testing.T) { + iri := `abcdefghijklmnopqrstuvwxzy` + start := bytes.NewBufferString(iri) + + sim := stringimporter{} + + ifim := IriFixImporter{ + Next: &sim, + } + + if err := ifim.Import(start); err != nil { + t.Errorf("import failed: %v", err) + } + + imported := sim.Sink() + + if iri != imported { + t.Errorf("iri=%v and imported=%v do not match", iri, imported) + } +} + +func TestZeroChange(t *testing.T) { + iri := `abcdefghijklm"nopqrs"tuvwxzy` + start := bytes.NewBufferString(iri) + + sim := stringimporter{} + + ifim := IriFixImporter{ + Next: &sim, + } + + if err := ifim.Import(start); err != nil { + t.Errorf("import failed: %v", err) + } + + imported := sim.Sink() + + if iri != imported { + t.Errorf("iri=%v and imported=%v do not match", iri, imported) + } +} + +func TestPipe(t *testing.T) { + iri := `abcdefghijklm"no|pqrs"tuvwxzy` + escaped := `abcdefghijklm"no%7Cpqrs"tuvwxzy` + start := bytes.NewBufferString(iri) + + sim := stringimporter{} + + ifim := IriFixImporter{ + Next: &sim, + } + + if err := ifim.Import(start); err != nil { + t.Errorf("import failed: %v", err) + } + + imported := sim.Sink() + + if escaped != imported { + t.Errorf("escaped=%v and imported=%v do not match", escaped, imported) + } +}