#!/usr/bin/python
"""XML Enriched N-Triples Parser. Sean B. Palmer, 2003-06. GPL 2."""

import io
import sys
import urllib
import xml.sax
import xml.sax.handler
from contextlib import closing

try:  # Python 2: keep using cStringIO so behaviour there is unchanged.
    import cStringIO
except ImportError:  # Python 3: fall back to the io module.
    cStringIO = None

try:  # Python 3
    from urllib.request import urlopen as _urlopen
except ImportError:  # Python 2
    _urlopen = urllib.urlopen

try:
    from uripath import join as urijoin
except ImportError:
    try:  # Python 2
        from urlparse import urljoin as urijoin
    except ImportError:  # Python 3
        from urllib.parse import urljoin as urijoin

# Namespace URI that marks XENT elements.
# NOTE(review): "@@" looks like a placeholder that was never filled in.
xent_ns = "@@"

# Replacements applied to literal text before it is wrapped in double quotes.
# The backslash must come first so the escapes added afterwards are not
# themselves escaped a second time.
_LITERAL_ESCAPES = (
    ('\\', '\\\\'),
    ('"', '\\"'),
    ('\n', '\\n'),
    ('\r', '\\r'),
    ('\t', '\\t'),
)


def _escape_literal(text):
    """Return *text* with backslash, quote, newline, CR and tab escaped."""
    for raw, escaped in _LITERAL_ESCAPES:
        text = text.replace(raw, escaped)
    return text


def _as_stream(s):
    """Wrap the document *s* in a file-like object the SAX parser can read."""
    if cStringIO is not None:
        return cStringIO.StringIO(s)
    if isinstance(s, bytes):
        return io.BytesIO(s)
    return io.StringIO(s)


class XENTParser(dict, xml.sax.handler.ContentHandler):
    """SAX content handler that turns XENT markup into triples.

    The instance doubles as the prefix table: being a dict, it maps each
    namespace prefix to a stack (list) of URIs, the last one being the
    binding currently in scope.

    Every completed statement is reported through ``sink.triple(s, p, o)``
    with the three terms already serialised as strings.
    """

    def __init__(self, sink, base=None):
        """Create a handler.

        sink -- object with a ``triple(s, p, o)`` method.
        base -- URI used to resolve relative references; None or '' means
                references are emitted exactly as written.
        """
        xml.sax.handler.ContentHandler.__init__(self)
        self.stack = []    # local names of the open XENT elements
        self.sink = sink
        self.base = base   # was hard-coded to None, discarding the argument
        self.next = 0      # index (0, 1 or 2) of the next term to fill
        self.s = ''        # text collected inside the current <s> element
        self.triple = {}   # position -> term; created here so that stray
                           # text before the first <t> cannot hit a missing
                           # attribute

    def uri(self, u):
        """Return *u* in angle brackets, resolved against the base if any."""
        if self.base:
            u = urijoin(self.base, u)
        return '<%s>' % u

    def qname(self, q):
        """Expand the ``prefix:name`` term *q* to a bracketed URI.

        Raises ValueError when *q* does not contain exactly one colon and
        KeyError when the prefix has never been declared.
        """
        pfx, n = q.split(':')
        return self.uri(self[pfx][-1] + n)

    def term(self, t=None):
        """Record one term and emit a triple when a statement is complete.

        Called with no argument when an <s> element closes: the collected
        text (possibly empty) becomes a quoted literal. Otherwise *t* is a
        token from character data: ``$name`` is a blank node, ``'ref`` is a
        URI reference and anything else is treated as a qname.
        """
        if t is None or self.s:
            t = '"%s"' % _escape_literal(self.s)
            self.s = ''
        elif t.startswith('$'):
            t = '_:' + t[1:]
        elif t.startswith("'"):
            t = self.uri(t[1:])
        else:
            t = self.qname(t)

        self.triple[self.next] = t
        self.next += 1

        if self.next != 3:
            return
        if self.stack[-1:] == ['properties']:
            # Shared subject: emit, then keep only the subject so the
            # next predicate/object pair can follow.
            self._emit()
            del self.triple[1], self.triple[2]
            self.next = 1
        elif self.stack[-1:] == ['objects']:
            # Shared subject and predicate: emit, then keep both so the
            # next object can follow.
            self._emit()
            del self.triple[2]
            self.next = 2

    def _emit(self):
        """Send the three collected terms to the sink."""
        self.sink.triple(self.triple[0], self.triple[1], self.triple[2])

    def startPrefixMapping(self, pfx, uri):
        """Push *uri* as the current binding of prefix *pfx*."""
        self.setdefault(pfx, []).append(uri)

    def endPrefixMapping(self, pfx):
        """Drop the innermost binding of prefix *pfx*."""
        self[pfx].pop()

    def startElementNS(self, name, qname, attrs):
        """Track XENT elements; a <t> element starts a fresh statement."""
        if name[0] == xent_ns:
            self.stack.append(name[1])
        if name == (xent_ns, 't'):
            self.next = 0
            self.triple = {}

    def characters(self, chars):
        """Collect literal text inside <s>; elsewhere read chars as terms.

        Terms are separated by any run of whitespace.

        NOTE(review): a SAX parser may deliver one text node in several
        calls; a term cut in two by such a boundary would be read as two
        terms. Confirm whether that needs buffering.
        """
        if chars and self.stack[-1:] == ['s']:
            self.s += chars
        else:
            for token in chars.split():
                self.term(token)

    def endElementNS(self, name, qname):
        """Close an element: finish a literal or emit a plain statement."""
        if name[0] == xent_ns:
            self.stack.pop()
        if name == (xent_ns, 's'):
            self.term()
        elif name == (xent_ns, 't') and len(self.triple) == 3:
            self._emit()


class Sink(object):
    """Default sink: writes each triple to standard output."""

    def triple(self, s, p, o):
        """Write one ``s p o .`` line to standard output."""
        line = "%s %s %s ." % (s, p, o)
        if not isinstance(line, str):
            # Python 2 unicode: encode to UTF-8 bytes before writing.
            line = line.encode('utf-8')
        sys.stdout.write(line + "\n")


def parseXENT(s, base=None, sink=None):
    """Parse the XENT document *s* and return the sink that received it.

    s    -- the document as a string (bytes or text).
    base -- URI against which relative references are resolved.
    sink -- receiver of the triples; a printing Sink is used when omitted.
    """
    sink = sink or Sink()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(XENTParser(sink, base))
    parser.parse(_as_stream(s))
    return sink


def parseURI(uri):
    """Fetch *uri*, parse it with itself as the base, and return the sink."""
    # closing() releases the connection even if reading fails.
    with closing(_urlopen(uri)) as response:
        data = response.read()
    return parseXENT(data, base=uri)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(__doc__)
    else:
        parseURI(sys.argv[1])