import sys
import string
import re
import util
import rdfstore

# Index names for the four slots of a quad and the two slots of a term tuple.
# appendq() below builds each quad as three (value, type) term tuples followed
# by a (context, RC) tuple, which is the layout these indices describe.
# They are not referenced in this module; presumably importers use them --
# TODO confirm before removing.
S, P, O, C = 0, 1, 2, 3
V, T = 0, 1

# Type tags stored as the second element of every term tuple.
URI = 'tag:infomesh.net,2001-08-07:URI'
LIT = 'tag:infomesh.net,2001-08-07:Literal'
ANON = 'tag:infomesh.net,2001-08-07:Anon'
RC = 'tag:infomesh.net,2001-08-07:RootContext'
CONT = 'tag:infomesh.net,2001-08-07:Context'  # For extensibility

# Blank node name check, compiled once instead of once per term.
# NOTE(review): it is applied with .match(), so only the start of the name is
# checked (a name such as "a-b" passes). Kept as-is to avoid rejecting
# documents that were accepted before; anchor it if strictness is wanted.
_BNODE_NAME = re.compile(r'[A-Za-z][A-Za-z0-9]*', re.S)


class NTriplesError(ValueError):
    """Raised when a document, line or term is not acceptable NTriples.

    Replaces the string exceptions this module used to raise, which are a
    TypeError on Python 2.6+ and invalid on Python 3. The messages are the
    same as before.
    """


class NTriples(rdfstore.RDFStore):
    """An NTriples document.

    cf. http://www.w3.org/TR/2001/WD-rdf-testcases-20010912/#ntriples

    Parsing reads a whole document, normalizes its newlines, and turns each
    triple line into a quad that is handed to ``self.add`` (inherited from
    ``rdfstore.RDFStore``, not visible in this module).
    """

    def __init__(self):
        """Set up an empty store and compile the line-matching patterns."""
        self.document = None
        rdfstore.RDFStore.__init__(self, data=[])
        # self.quads = rdfstore.RDFStore.quads

        # appendq()/correctbnode() need these. The base class may already
        # create them (it is defined elsewhere), so only fill in what is
        # missing rather than overwrite its state.
        if not hasattr(self, 'bnodes'):
            self.bnodes = []
        if not hasattr(self, 'CurrentQuad'):
            self.CurrentQuad = []

        # One term: <uri>, _:bnode, or a "quoted literal" with \" escapes.
        self.t = r'(<[^>]+>|_:[^\s]+|\"(?:\\\"|[^"])*\")'
        # Statement terminator. The dot is escaped: left bare it matched any
        # character, so a line with a missing terminator was accepted.
        self.eol = r'[ \t]*\.[ \t]*'
        self.rt = (r'[ \t]*' + self.t + r'[ \t]+' + self.t +
                   r'[ \t]+' + self.t + self.eol)
        self.regt = re.compile(self.rt, re.S)
        # Comment or whitespace line. "[^\n]*" (rather than "[^\n]") lets a
        # bare "#" with nothing after it count as a comment.
        # NOTE(review): both patterns are used with .match(), which anchors
        # only at the start of the line, so trailing text is not checked.
        self.comment = r'([ \t]*\#[^\n]*)|([ \t]+)'
        self.regc = re.compile(self.comment, re.S)

    def parsen(self, fn):
        """Parse an NTriples document passed as a file name.

        The file is closed even if parsing raises.
        """
        f = open(fn, 'r')
        try:
            self.parse(f)
        finally:
            f.close()

    def parse(self, f):
        """Parse an NTriples document from an object with a read() method."""
        self.document = f.read()  # Read the file into self.document
        self.normnl()   # Normalize the new lines in self.document
        self.parsent()  # Parse self.document into quads via appendq()

    def normnl(self):
        """Normalize the newlines within self.document to '\\n'.

        Raises NTriplesError if the document is empty.
        """
        if len(self.document) == 0:
            raise NTriplesError('Document has no content')
        # '\r\n' must be replaced first so it does not become two newlines.
        self.document = self.document.replace('\r\n', '\n')
        self.document = self.document.replace('\r', '\n')

    def _lines(self):
        """Return the non-empty lines of self.document.

        str.split already yields a one-element list when there is no
        newline, so no special case is needed for single-line documents.
        """
        return [line for line in self.document.split('\n') if len(line) != 0]

    def _termkind(self, term):
        """Classify one matched term as URI, ANON or LIT.

        Raises NTriplesError for a blank node whose name fails the name
        check, or for a term that is none of the three forms.
        """
        if term[0] == '<' and term[-1] == '>':
            return URI
        if term[:2] == '_:':
            # This filters out incorrect bNodes! CWM produced these...
            if not _BNODE_NAME.match(term[2:]):
                raise NTriplesError(
                    'bnode: "' + str(term[2:]) + '" is not a valid bnode')
            return ANON
        if term[0] == '"' and term[-1] == '"':
            return LIT
        raise NTriplesError(
            'Term ' + str(term) + ' is not a valid NTriples term.')

    def parsent(self):
        """Parse self.document line by line, adding one quad per triple.

        A fresh context and suffix are taken from util for every document;
        correctbnode() uses them to keep blank nodes from different
        documents apart.
        """
        self.CurrentContext = util.generatecontext()
        self.rand = util.generateint()
        for line in self._lines():
            self.appendq(line)

    def appendq(self, line):
        """Parse and validate one line, and add its quad to the store.

        Whitespace and comment lines are ignored. Raises NTriplesError when
        the line is neither a triple nor a comment, or when a term is bad.
        """
        match = self.regt.match(line)
        if match is None:
            if self.regc.match(line):
                return  # Line is just whitespace, or is a comment
            raise NTriplesError(
                'line: "' + line + '" isn\'t fine')  # Validity error!

        # Start from a clean quad so that a previous line which raised
        # half-way through cannot leak its terms into this one.
        self.CurrentQuad = []
        for term in match.groups():
            kind = self._termkind(term)
            if kind == URI:
                self.CurrentQuad.append((term[1:-1], URI))
            elif kind == ANON:
                # Check the bnode isn't already used in a different context
                bnode = self.correctbnode(term[2:])
                self.bnodes.append((bnode, self.CurrentContext))
                self.CurrentQuad.append((bnode, ANON))
            else:
                self.CurrentQuad.append((util.unescape(term[1:-1]), LIT))
        self.CurrentQuad.append((self.CurrentContext, RC))
        self.add(self.CurrentQuad)
        # Rebind instead of clearing: the list just passed to add() must not
        # be emptied behind its back.
        self.CurrentQuad = []

    def correctbnode(self, bnode):
        """Return a bnode name that does not clash with another context.

        While the name is recorded in self.bnodes under a context other than
        the current one, self.rand is appended to it. The same suffix is
        used for the whole document, so a given input name always maps to
        the same output name within that document.
        """
        suffix = str(self.rand)  # str(): the helper's return type is unseen
        while self._clashes(bnode):
            bnode = bnode + suffix  # Must be consistent!
        return bnode

    def _clashes(self, bnode):
        """Tell whether bnode is recorded under a different context."""
        for known in self.bnodes:
            if known[0] == bnode and known[1] != self.CurrentContext:
                return True
        return False

    def validaten(self, fn):
        """Validate an NTriples document passed as a file name.

        The file is closed even if validation raises.
        """
        f = open(fn, 'r')
        try:
            self.validate(f)
        finally:
            f.close()

    def validate(self, f):
        """Validate an NTriples document from an object with read()."""
        self.document = f.read()  # Read the file into self.document
        self.normnl()
        self.validatent()

    def validatent(self):
        """Validate self.document, then print it.

        Applies the same line and term checks as appendq() without adding
        anything to the store. Raises NTriplesError on the first bad line
        or term. URI and literal terms are accepted here; previously every
        term that was not a blank node was rejected, so no real document
        could pass.
        """
        for line in self._lines():
            match = self.regt.match(line)
            if match is None:
                if self.regc.match(line):
                    continue
                raise NTriplesError('line: "' + line + '" isn\'t fine')
            for term in match.groups():
                self._termkind(term)
        print(self.document)


def run():
    """Command-line entry point: parse FILE and print the result.

    With "-xrdf" as the second argument the output of x.xrdf() is printed,
    otherwise x.printquads() is called (both come from the base class).
    Exits with status 2 and a usage line when no file is given.
    """
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s FILE [-xrdf]\n' % sys.argv[0])
        sys.exit(2)
    x = NTriples()
    x.parsen(sys.argv[1])
    if len(sys.argv) > 2 and sys.argv[2] == '-xrdf':
        print(x.xrdf())
    else:
        x.printquads()


# Main program

if __name__ == "__main__":
    run()

# Phew