#!/usr/bin/python
"""n3s - A Notation3 command line preprocessor and editor.
Sean B. Palmer, April 2002

Basic usage: python n3s -[pus]
   -p: pipe in
       e.g. `cat FileName | python n3s -p`
   -u: use file (default)
       e.g. `python n3s -u FileOrURI` or `python n3s FileOrURI`
   -s: use command line string
       e.g. `python n3s -s '{ ?x rdfs:label "x" } => { :Test a :Pass }'`

Further Usage: python n3s -[td][pus][i[=id]]
   -t tokenize
   -d debug
   -i load a file, and use the stuff declared within

For more information: http://infomesh.net/2002/n3s/
Thanks to deltab and Aaron Swartz for their help and suggestions.
"""

__author__ = 'Sean B. Palmer'
__license__ = 'Copyright (C) 2002 Sean B. Palmer. GNU GPL 2'

import random
import re
import sys
import time
import urllib

try:  # Python 3
    from urllib.request import urlopen as _urlopen
except ImportError:  # Python 2
    from urllib import urlopen as _urlopen

# Set the following variable to a file name or URI if you
# want to use a default file rather than relying on the
# "i" command line mode.

LocalSetup = None

# -- End of configurable stuff. Snip here. Go home, folks --


class N3Error(ValueError):
    """Raised for malformed input or invalid command line usage."""


# 'varspace' is a per-run namespace used to mint names for universal
# variables that would otherwise clash with an existing ':name' token.
# 'default' is the PREFIXES key holding the namespace applied to bare names;
# it is a single space so it can never collide with a real prefix.
varspace, default = 'tag:n3s.infomesh.net,%s%s#' % (
    time.strftime('%Y-%m-%d:%H%M%S', time.gmtime(time.time())),
    ''.join([random.choice('0123456789abcdefghijklmnopqrstuvwxyz')
             for _ in range(10)])), ' '

# More magic stuff that we know about

RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'  # for 'a' keyword
DAML_NS = 'http://www.daml.org/2001/03/daml+oil#'  # for '=' keyword
LOG_NS = 'http://www.w3.org/2000/10/swap/log#'  # for forAll/forSome keywords

PREFIXES = {
    default: '#',
    '': '#',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
    'daml': 'http://www.daml.org/2001/03/daml+oil#',
    'dpo': 'http://www.daml.org/2001/03/daml+oil#',
    'log': 'http://www.w3.org/2000/10/swap/log#',
    'string': 'http://www.w3.org/2000/10/swap/string#',
    'crypto': 'http://www.w3.org/2000/10/swap/crypto#',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dct': 'http://purl.org/dc/terms/',
    'foaf': 'http://xmlns.com/foaf/0.1/',
    'wot': 'http://xmlns.com/wot/0.1/',
    'earl': 'http://www.w3.org/2001/03/earl/0.95#',
    'doc': 'http://www.w3.org/2000/10/swap/pim/doc#',
    'swn': 'http://purl.org/net/swn#',
    'a': 'http://www.megginson.com/exp/ns/airports#',
    'contact': 'http://www.w3.org/2000/10/swap/pim/contact#',
    'i': 'http://www.w3.org/2001/04/infoset#',
    'math': 'http://www.w3.org/2000/10/swap/math#',
    'os': 'http://www.w3.org/2000/10/swap/os#',
    'rcs': 'http://www.w3.org/2001/03swell/rcs#',
    'wn': 'http://xmlns.com/wordnet/1.6/',
    'v': varspace,
}

# Maps a bare keyword to [namespace, local name]; None means the keyword is
# passed through to the output untouched.
KEYWORDS = {
    'forall': [LOG_NS, 'forAll'],
    'forsome': [LOG_NS, 'forSome'],
    '=>': [LOG_NS, 'implies'],
    '=': [DAML_NS, 'equivalentTo'],
    'a': [RDF_NS, 'type'],  # since not allowed as subject
    'this': None,
    'is': None,
    'of': None,
}


def group(*n):
    """Return a regex group matching any one of the alternatives in *n*."""
    return '(%s)' % '|'.join(n)


# These are the basic Notation3 tokens, with some extensions

Name = r'[A-Za-z0-9_]+'
URI = r'<[^ >]*>'
bNode = r'_:' + Name
Univar = r'\?' + Name
Prefix = r'(?:[A-Za-z][A-Za-z0-9_]*)?:'
QName = Prefix + Name
Literal = r'"[^"\\]*(?:\\.[^"\\]*)*"'
LLiteral = r'"""[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
AtPrefix = r'@prefix'
AtUse = r'@use'
AtKeyWord = r'@keyword'
WS = r'[ \t]'

# Order matters: earlier alternatives win, so long literals come before
# short ones and QNames before bare prefixes and names.
Tokens = group(LLiteral, URI, Literal, AtKeyWord, AtPrefix, ':-', QName,
               AtUse, bNode, Prefix, Name, Univar, 'is', 'of', '=>', '=',
               r'\{', r'\}', r'\(', r'\)', r'\[', r'\]', ',', ';', r'\.',
               WS, '\n')
Token = re.compile(Tokens, re.S)

# Patterns compiled once here rather than once per token.
_N3_COMMENT = re.compile(r'([ \t]*\#[^\n]*)', re.S)
_NAME_RE = re.compile('^' + Name + '$')
_UNIVAR_RE = re.compile('^' + Univar + '$')
_QNAME_RE = re.compile('^' + QName + '$')
_PRE_TOKEN = re.compile(r'(%s|%s|%s|#|\n|[ \t]+|\S+)' % (URI, LLiteral, Literal))

_WHITESPACE_TOKENS = (' ', '\t', '\n', '\r')
_SEMICOLON_BREAK = '; \n '

# # # # # # # # # # # # # # # # # # # # # # # #
# TOKENIZER: General tokenizing functions
#


def notComment(s):
    """Return '' if the line *s* is a whole-line comment, else *s* unchanged."""
    if _N3_COMMENT.match(s):
        return ''
    return s


def toke(s, FILTER=1):
    """Notation3 tokenizer. Takes in a string, returns a raw token list.

    Whole-line comments are dropped first.  When FILTER is true (the
    default) whitespace tokens are removed from the result.

    Raises N3Error if *s* is empty.
    """
    if len(s) == 0:
        raise N3Error('Document has no content')
    lines = s.replace('\r\n', '\n').replace('\r', '\n').split('\n')
    s = '\n'.join([notComment(line) for line in lines]).strip()
    found = Token.findall(s)
    if FILTER:
        # A real list (not a lazy filter object): callers use .index().
        return [tok for tok in found if tok not in _WHITESPACE_TOKENS]
    return found

# # # # # # # # # # # # # # # # # # # # # # # #


def _statement_end(tokes, start):
    """Return the index just past the first '.' at or after *start*."""
    try:
        return tokes.index('.', start) + 1
    except ValueError:
        raise N3Error('Syntax error: unterminated ' + ' '.join(tokes[start:start + 5]))


def declarations(tokes):
    """This parses a list of tokens, mapping a set of keywords
    to their actual QNames, keywords, or URIs.

    @keyword, @use and @prefix statements are consumed: they update the
    module-level KEYWORDS and PREFIXES tables and are removed from the
    returned token list.  Raises N3Error on a malformed declaration.
    """
    while '@keyword' in tokes:
        i = tokes.index('@keyword')
        j = _statement_end(tokes, i)
        keywords = ''.join(tokes[i + 1:j - 1]).split(',')
        tokes = tokes[:i] + tokes[j:]
        for k in keywords:
            if k and k not in KEYWORDS:
                KEYWORDS[k] = None

    while '@use' in tokes:
        i = tokes.index('@use')
        j = _statement_end(tokes, i)
        use, tokes = tokes[i:j], tokes[:i] + tokes[j:]
        if len(use) < 4:  # need at least: @use name <uri> .
            raise N3Error('Syntax error: ' + ' '.join(use))
        namespace = use[-2][1:-1]  # strip the surrounding < >
        for k in ''.join(use[1:-2]).split(','):
            if k not in KEYWORDS:
                KEYWORDS[k] = [namespace, k]

    while '@prefix' in tokes:
        i = tokes.index('@prefix')
        if len(tokes) < i + 4 or tokes[i + 3] != '.':
            raise N3Error('Syntax error: ' + ' '.join(tokes[i:i + 5]))
        if tokes[i + 1] != 'default':
            PREFIXES[tokes[i + 1][:-1]] = tokes[i + 2][1:-1]
        else:
            PREFIXES[default] = tokes[i + 2][1:-1]
        tokes = tokes[:i] + tokes[i + 4:]
    return tokes


def compat(t):
    """Takes in a set of declarationless tokens.

    Rewrites keywords, bare names and ?variables into QNames or URI-refs
    (modifying *t* in place), prepends a `this log:forAll ...` statement
    when variables were found, and makes sure the result ends with '.'.
    An empty token list is returned unchanged.
    """
    # RevPREF is PREFIXES inside-out
    RevPREF = {}
    for prefix, namespace in PREFIXES.items():
        if prefix != default:
            RevPREF[namespace] = prefix

    UNIVARS = {}
    for i, tok in enumerate(t):
        if ':' in tok:
            continue  # it's a QName, so skip it. (for speed)
        if tok in KEYWORDS:
            target = KEYWORDS[tok]
            if target is None:
                continue
            if target[0] in RevPREF:  # make QName
                t[i] = RevPREF[target[0]] + ':' + target[1]
            else:  # make URI-ref
                t[i] = '<' + ''.join(target) + '>'
        elif _NAME_RE.match(tok):
            if default not in PREFIXES:
                raise N3Error('Keyword ' + tok + ' not declared')
            namespace = PREFIXES[default]
            if namespace in RevPREF:  # make QName
                t[i] = RevPREF[namespace] + ':' + tok
            else:  # make URI-ref
                t[i] = '<' + namespace + tok + '>'
        elif _UNIVAR_RE.match(tok):
            mapped = UNIVARS.get(tok)
            if mapped is None:  # first time we meet this variable
                plain = tok.replace('?', ':')
                if plain not in t:
                    mapped = plain  # the simple case: ?x becomes :x
                elif PREFIXES.get('v') == varspace:
                    # ':x' is already taken; mint a QName in varspace
                    mapped = tok.replace('?', 'v:')
                else:
                    # the 'v:' prefix has been overwritten, so use a URI-ref
                    mapped = '<' + varspace + tok.replace('?', '') + '>'
                UNIVARS[tok] = mapped
            t[i] = mapped

    # for now, just tack the var stuff on the front
    if PREFIXES.get('log') == LOG_NS:
        logforAll = 'log:forAll'
    else:
        logforAll = '<' + LOG_NS + 'forAll>'
    if UNIVARS:
        UNIvals = sorted(UNIVARS.values())
        t = ['this', logforAll] + ' , '.join(UNIvals).split() + ['.'] + t
    if t and t[-1] != '.':
        t.append('.')  # add the trailing period (formula analogy)
    return t


def _glue_lists(tokes):
    """Attach every '(' to the token after it and every ')' to the one before."""
    out = []
    glue_next = False
    for tok in tokes:
        if tok == ')' and out:
            out[-1] += tok
            glue_next = False
        elif glue_next:
            out[-1] += tok
            glue_next = (tok == '(')  # nested '(' keeps gluing
        else:
            out.append(tok)
            glue_next = (tok == '(')
    return out


def serialize(tokes):
    """Format a token list as N3 text, preceded by the @prefix lines it needs.

    Short triple-quoted literals (under 70 characters) are rewritten as
    single-quoted ones.  The input list is not modified.

    Raises N3Error if a QName uses a prefix that is not in PREFIXES.
    """
    tokes = list(tokes)

    # Find which prefixes are used
    preused = []
    for i, tok in enumerate(tokes):
        if _QNAME_RE.match(tok):
            prefix = tok.split(':')[0]
            if prefix not in preused:
                preused.append(prefix)
        elif tok.startswith('"""') and len(tok) < 70:
            tokes[i] = '"' + tok[3:-3].replace('\n', '\\n').replace('"', '\\"') + '"'
    preused.sort()  # neatly order the prefixes that we are going to output

    lines = []
    for p in preused:
        if p not in PREFIXES:
            raise N3Error('Invalid N3: The prefix "' + p +
                          '" has been used but not declared')
        lines.append('@prefix %s: <%s> .\n' % (p, PREFIXES[p]))
    prefixes = ''.join(lines)

    # Format ; and . tokens.  The token after a '.' gets a leading newline,
    # so it is no longer seen as '.' or ';' when the loop reaches it.
    last = len(tokes) - 1
    for i, tok in enumerate(tokes):
        if tok == '.' and i != last:
            tokes[i + 1] = '\n' + tokes[i + 1]
        elif tok == ';':
            tokes[i] = _SEMICOLON_BREAK

    # Hang each ';' break off the end of the token before it
    merged = []
    for tok in tokes:
        if tok == _SEMICOLON_BREAK and merged:
            merged[-1] += tok
        else:
            merged.append(tok)
    tokes = merged

    # Format () DAML lists
    if '(' in tokes:
        tokes = _glue_lists(tokes)

    if prefixes:
        return prefixes + '\n' + ' '.join(tokes)
    return ' '.join(tokes)


def openid(id):
    """Return the text found at *id*, tried as a URI first, then as a file name."""
    try:
        stream = _urlopen(id)
        try:
            data = stream.read()
        finally:
            stream.close()
    except Exception:
        # Deliberate best effort: anything that is not a fetchable URI is
        # treated as a local file name.
        with open(id, 'r') as handle:
            return handle.read()
    if not isinstance(data, str):  # bytes on Python 3
        data = data.decode('utf-8')
    return data


def preProcess(s):
    """Normalise line endings in *s* and strip '#' comments.

    A '#' inside a URI-ref or string literal, or in the middle of a word,
    is not treated as a comment.  The newline ending a comment is kept.
    """
    s = s.replace('\r\n', '\n').replace('\r', '\n')
    kept = []
    in_comment = False
    for piece in _PRE_TOKEN.findall(s):
        if in_comment:
            if piece == '\n':
                in_comment = False
                kept.append(piece)
        elif piece == '#':
            in_comment = True
        else:
            kept.append(piece)
    return ''.join(kept)


def process(s):
    """Take a string, tokenize it, strip the declarations,
    make it backwards compatible, and format it."""
    s = preProcess(s)
    tokes = toke(s)
    tokes = declarations(tokes)
    tokes = compat(tokes)
    return serialize(tokes)


def run():
    """Command line entry point; see the module docstring for the flags."""
    arg = sys.argv[1]
    if not arg.startswith('-'):
        print(process(openid(arg)))
        return

    flag, loaded = arg[1:], None
    if 'i=' in flag:
        # split once only, so an id containing '=' survives intact
        opts, ident = flag.split('=', 1)
        Options, loaded = list(opts[:-1]), openid(ident)
    elif 'i' in flag:
        Options, loaded = list(flag.replace('i', '')), sys.stdin.read()
    else:
        Options = list(flag)

    # These calls are made only for their effect on KEYWORDS and PREFIXES.
    if LocalSetup:
        declarations(toke(openid(LocalSetup)))
    if 'i' in flag:
        declarations(toke(loaded))

    if 'p' in Options:
        if ('i' in flag) and loaded:
            feed = loaded[:]
        else:
            feed = sys.stdin.read()  # STDIN
    elif 'u' in Options:
        if len(sys.argv) < 3:
            raise N3Error('No file or URI given for -u')
        feed = openid(sys.argv[2])  # URI/fn
    elif 's' in Options:
        feed = ' '.join(sys.argv[2:])  # Command line
    else:
        raise N3Error('No input mode selected: -[pus]')

    if 'd' in Options:
        print('%s \n%s \n%s' % (process(feed), KEYWORDS, PREFIXES))
    elif 't' in Options:
        print(toke(process(feed)))
    else:
        print(process(feed))


if __name__ == "__main__":
    if len(sys.argv) > 1:
        run()
    else:
        print(__doc__)