#!/usr/bin/python
"""n3s - A Notation3 command line preprocessor and editor.
Sean B. Palmer <http://purl.org/net/sbp/>, April 2002

 Basic usage: python n3s -[pus]

   -p: pipe in
      e.g. `cat FileName | python n3s -p`
   -u: use file (default)
      e.g. `python n3s -u FileOrURI` or `python n3s FileOrURI`
   -s: use command line string
      e.g. `python n3s -s '{ ?x rdfs:label "x" } => { :Test a :Pass }'`

 Further Usage: python n3s -[td][pus][i[=id]]

   -t tokenize
   -d debug
   -i load a file (from STDIN, or from `id` when given as -i=id) and use
      the @prefix, @keyword and @use declarations made within it

For more information: http://infomesh.net/2002/n3s/

Thanks to deltab and Aaron Swartz for their help and suggestions.
"""

__author__ = 'Sean B. Palmer'
__license__ = 'Copyright (C) 2002 Sean B. Palmer. GNU GPL 2'

import sys, re, time, random, urllib

# Set the following variable to a file name or URI if you 
# want to use a default file rather than relying on the 
# "i" command line mode.
#
# When it is set, run() fetches it with openid() and passes its tokens
# through declarations(), so the @prefix, @keyword and @use declarations
# it contains are in force before the real input is processed. run() only
# does this when the first command line argument is a -flag.

LocalSetup = None 

# -- End of configurable stuff. Snip here. Go home, folks --

# varspace: a tag: URI namespace built from the current UTC time plus ten
# random characters. It is bound to the 'v' prefix in PREFIXES and used by
# compat() to name universal variables whose plain name is already taken.
# The random alphabet is the digits plus all 26 lower case letters (the
# earlier spelling ended "...uvwzyz", leaving out 'x' and doubling 'z').
varspace = 'tag:n3s.infomesh.net,%s%s#' % (
   time.strftime('%Y-%m-%d:%H%M%S', time.gmtime(time.time())),
   ''.join([random.choice('0123456789abcdefghijklmnopqrstuvwxyz')
            for x in range(10)]))

# default: the PREFIXES key under which the default namespace is stored.
# A single space can never clash with a prefix read from a document.
default = ' '

# Namespaces that the preprocessor itself has to know about

RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' # for 'a' keyword
DAML_NS = 'http://www.daml.org/2001/03/daml+oil#' # for '=' keyword
LOG_NS = 'http://www.w3.org/2000/10/swap/log#' # for forAll/forSome keywords

# Prefix -> namespace table that every document starts out with.
# declarations() adds to it (and may overwrite entries) as it reads
# @prefix lines; compat() and serialize() look names up in it.
PREFIXES = {
   default: '#',
   '': '#',
   'rdf': RDF_NS,
   'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
   'daml': DAML_NS,
   'dpo': DAML_NS,
   'log': LOG_NS,
   'string': 'http://www.w3.org/2000/10/swap/string#',
   'crypto': 'http://www.w3.org/2000/10/swap/crypto#',
   'dc': 'http://purl.org/dc/elements/1.1/',
   'dct': 'http://purl.org/dc/terms/',
   'foaf': 'http://xmlns.com/foaf/0.1/',
   'wot': 'http://xmlns.com/wot/0.1/',
   'earl': 'http://www.w3.org/2001/03/earl/0.95#',
   'doc': 'http://www.w3.org/2000/10/swap/pim/doc#',
   'swn': 'http://purl.org/net/swn#',
   'a': 'http://www.megginson.com/exp/ns/airports#',
   'contact': 'http://www.w3.org/2000/10/swap/pim/contact#',
   'i': 'http://www.w3.org/2001/04/infoset#',
   'math': 'http://www.w3.org/2000/10/swap/math#',
   'os': 'http://www.w3.org/2000/10/swap/os#',
   'rcs': 'http://www.w3.org/2001/03swell/rcs#',
   'wn': 'http://xmlns.com/wordnet/1.6/',
   'v': varspace,
}

# Bare words and symbols with a fixed meaning. A [namespace, local name]
# value is what compat() expands the keyword to; None marks a word that
# is left exactly as written.
KEYWORDS = {
   'forall': [LOG_NS, 'forAll'],
   'forsome': [LOG_NS, 'forSome'],
   '=>': [LOG_NS, 'implies'],
   '=': [DAML_NS, 'equivalentTo'],
   'a': [RDF_NS, 'type'], # since not allowed as subject
}
KEYWORDS.update(dict.fromkeys(['this', 'is', 'of'])) # all map to None

def group(*n):
   """Combine regex fragments into a single capturing alternation.

   Each positional argument is a pattern string; the result is those
   patterns joined with '|' and wrapped in one pair of parentheses.
   """
   alternatives = '|'.join(n)
   return '(%s)' % alternatives

# Regex fragments for the basic Notation3 tokens, plus a few extensions

# Names and the things built from them
Name = r'[A-Za-z0-9_]+'
Prefix = r'(?:[A-Za-z][A-Za-z0-9_]*)?:'
QName = Prefix + Name
bNode = r'_:' + Name
Univar = r'\?' + Name

# URI references and string literals ("..." and """...""")
URI = r'<[^ >]*>'
Literal = r'"[^"\\]*(?:\\.[^"\\]*)*"'
LLiteral = r'"""[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'

# Directives and (single character) horizontal whitespace
AtPrefix = r'@prefix'
AtUse = r'@use'
AtKeyWord = r'@keyword'
WS = r'[ \t]'

# Alternatives are tried left to right, so the order here matters: the
# triple-quoted literal must come before the plain one, QName before
# Prefix and Name, and '=>' before '='.
Tokens = group(
   LLiteral, URI, Literal, AtKeyWord, AtPrefix, r':-', QName,
   AtUse, bNode, Prefix, Name, Univar, r'is', r'of', r'=>', r'=',
   r'{', r'}', r'\(', r'\)', r'\[', r'\]', r',', r';', r'\.', WS, '\n')
Token = re.compile(Tokens, re.S)

# # # # # # # # # # # # # # # # # # # # # # #
# 
# TOKENIZER: General tokenizing functions
# 

def notComment(s): 
   """Return '' for a line that is nothing but an N3 comment, else the line.

   A comment line is optional spaces/tabs followed by '#'. A '#' that
   comes after other content does not count, so such a line is returned
   unchanged.
   """
   comment_line = re.compile(r'([ \t]*\#[^\n]*)', re.S)
   if comment_line.match(s) is None: 
      return s
   return ''

def toke(s, FILTER=1): 
   """Notation3 tokenizer. Takes in a string, returns a raw token list.

   Line endings are normalised to '\\n', whole-line comments are blanked
   with notComment(), and the text is stripped before being split with
   the Token regex.

   s      -- the N3 source text
   FILTER -- when true (the default) whitespace tokens are dropped from
             the result; when false they are kept

   Always returns a real list, so the result can be indexed and sliced
   by declarations() on any Python version.

   Raises ValueError if s is empty.
   """
   # A string cannot be raised as an exception; use a real exception
   # class so that the message actually reaches the user.
   if not s: raise ValueError('Document has no content')
   lines = s.replace('\r\n', '\n').replace('\r', '\n').split('\n')
   s = '\n'.join([notComment(line) for line in lines]).strip()
   tokens = Token.findall(s)
   if FILTER: 
      return [tok for tok in tokens if tok not in (' ', '\t', '\n', '\r')]
   return tokens

# 
# # # # # # # # # # # # # # # # # # # # # # #

def declarations(tokes): 
   """Strip the @keyword, @use and @prefix declarations from a token list.

   The declarations are recorded in the module level KEYWORDS and
   PREFIXES tables (which are modified in place), and the remaining
   tokens are returned as a new list:

      @keyword a, b .       each word is added to KEYWORDS as "leave as is"
      @use a, b <ns> .      each word is added to KEYWORDS as ns + word
      @prefix p: <ns> .     binds p in PREFIXES
      @prefix default <ns> .  sets the default namespace

   Words that are already in KEYWORDS are never overwritten.

   Raises ValueError when an @prefix declaration is not exactly
   "@prefix name <uri> ." (including one cut short by the end of input),
   or when an @keyword/@use declaration has no closing '.'. A malformed
   @prefix is rejected before PREFIXES is touched.
   """
   while '@keyword' in tokes: 
      i = tokes.index('@keyword')
      j = tokes.index('.', i) + 1 # one past the closing '.'
      keywords = ''.join(tokes[i+1:j-1]).split(',')
      tokes = tokes[:i] + tokes[j:]
      for k in keywords: 
         if k not in KEYWORDS: KEYWORDS[k] = None
   while '@use' in tokes: 
      i = tokes.index('@use')
      j = tokes.index('.', i) + 1
      use, tokes = tokes[i:j], tokes[:i] + tokes[j:]
      namespace = use[-2][1:-1] # the <uri> just before the '.', unbracketed
      for k in ''.join(use[1:-2]).split(','): 
         if k not in KEYWORDS: KEYWORDS[k] = [namespace, k]
   while '@prefix' in tokes: 
      i = tokes.index('@prefix')
      # Validate first, so a broken declaration neither registers a
      # prefix nor dies with an IndexError at the end of the input.
      if len(tokes) < i+4 or tokes[i+3] != '.': 
         raise ValueError("Syntax error: "+' '.join(tokes[i:i+5]))
      if tokes[i+1] != 'default': PREFIXES[tokes[i+1][:-1]] = tokes[i+2][1:-1]
      else: PREFIXES[default] = tokes[i+2][1:-1]
      tokes = tokes[:i] + tokes[i+4:]
   return tokes

def compat(t): 
   """Takes in a set of declarationless tokens, returns plain N3 tokens.

   Tokens are rewritten in place in t, as follows:

    * a token containing ':' is left alone (already a QName or similar);
    * a keyword from KEYWORDS that has an expansion becomes a QName if
      its namespace has a prefix in PREFIXES, otherwise a <URI-ref>;
      keywords mapped to None are left as they are;
    * any other bare name is put in the default namespace, again as a
      QName when possible and a <URI-ref> otherwise;
    * a universal variable ?x becomes :x, unless :x already occurs among
      the tokens, in which case it becomes v:x (or <varspace + x> if the
      'v' prefix no longer points at varspace).

   If any variables were found, a "this log:forAll ... ." statement
   listing them (sorted) is put in front, and a final '.' is appended
   when the tokens do not already end with one. An empty token list is
   returned unchanged.

   Returns the resulting list; this is a new list when the forAll
   statement was added, and t itself otherwise.

   Raises ValueError for a bare name when no default namespace is set.
   """
   name_only = re.compile('^'+Name+'$') # compiled once, not per token
   univar_only = re.compile('^'+Univar+'$')
   RevPREF, UNIVARS = {}, {} # RevPREF is PREFIXES inside-out
   for k in PREFIXES: 
      if k != default: RevPREF[PREFIXES[k]] = k
   for i in range(len(t)): 
      tok = t[i]
      if ':' in tok: continue # it's a QName, so skip it. (for speed)
      if tok in KEYWORDS: 
         expansion = KEYWORDS[tok]
         if expansion is None: pass
         elif expansion[0] in RevPREF: # make QName
            t[i] = RevPREF[expansion[0]]+':'+expansion[1]
         else: t[i] = '<'+''.join(expansion)+'>' # make URI-ref
      elif name_only.match(tok): 
         if default not in PREFIXES: 
            raise ValueError("Keyword "+tok+" not declared")
         if PREFIXES[default] in RevPREF: 
            t[i] = RevPREF[PREFIXES[default]]+':'+tok # make QName
         else: t[i] = '<'+PREFIXES[default]+tok+'>' # make URI-ref
      elif univar_only.match(tok): 
         # Congratulations, it's a Univar! Store it in the UNIVARS dictionary
         if tok not in UNIVARS: UNIVARS[tok] = None
         # t is searched as it stands now, so a later occurrence of the
         # same variable finds the ':x' written for the first one and
         # picks up its mapping through the branch below.
         if tok.replace('?', ':') in t: # problem! generate id, store mapping
            if UNIVARS[tok] is None: # we haven't found it before
               if PREFIXES.get('v') == varspace: # use a QName
                  UNIVARS[tok] = tok.replace('?', 'v:')
               else: # the 'v:' prefix has been overwritten, so use a URI-ref
                  UNIVARS[tok] = '<'+varspace+tok.replace('?', '')+'>'
            t[i] = UNIVARS[tok]
         else: # this is what we'd do if things were simpler
            UNIVARS[tok] = t[i] = tok.replace('?', ':')
   # for now, just tack the var stuff on the front
   if PREFIXES.get('log') == LOG_NS: logforAll = 'log:forAll'
   else: logforAll = '<'+LOG_NS+'forAll>'
   # sorted() rather than .values().sort(): on Python 3 values() is a
   # view object, which has no sort method
   UNIvals = sorted(UNIVARS.values())
   if UNIvals: 
      t = ['this', logforAll]+' , '.join(UNIvals).split()+['.']+t
   # add the trailing period (formula analogy); nothing to do when empty
   if t and t[-1] != '.': t.append('.')
   return t

def serialize(tokes): 
   # Find which prefixes are used
   preused, prefixes = [], ''
   for i in range(len(tokes)): 
      if re.compile('^'+QName+'$').match(tokes[i]): 
         if tokes[i].split(':')[0] not in preused: 
            preused.append(tokes[i].split(':')[0])
      elif tokes[i].startswith('"""') and len(tokes[i]) < 70: 
         tokes[i]='"'+tokes[i][3:-3].replace('\n','\\n').replace('"','\\"')+'"'
   preused.sort() # neatly order the prefixes that we are going to output
   for p in preused: 
      try: prefixes += '@prefix %s: <%s> .\n' % (p, PREFIXES[p])
      except KeyError: 
         raise 'Invalid N3: The prefix "'+p+'" has been used but not declared'
   # Format ; and . tokens
   for i in range(len(tokes)): 
      if (tokes[i] == '.') and (i+1 != len(tokes)): 
         tokes[i+1] = '\n'+tokes[i+1]
      elif tokes[i] == ';': tokes[i] = '; \n  '
   if tokes.count('; \n  ') > 0: 
      while('; \n  ' in tokes): 
         i = tokes.index('; \n  ')
         tokes = tokes[:i-1] + [tokes[i-1]+tokes[i]] + tokes[i+1:]
   # Format () DAML lists
   if tokes.count('('): 
      while ('(' in tokes) or (')' in tokes): 
         i = tokes.index('(')
         tokes = tokes[:i] + [tokes[i]+tokes[i+1]] + tokes[i+2:]
         i = tokes.index(')')
         tokes = tokes[:i-1] + [tokes[i-1]+tokes[i]] + tokes[i+1:]
   if prefixes: return prefixes+'\n'+' '.join(tokes)
   else: return ' '.join(tokes)

def openid(id): 
   """Return the content found at id, which is a URI or a local file name.

   id is tried as a URI first; if it cannot be retrieved that way it is
   opened as a local file. Whichever handle is used gets closed again.
   Data that arrives as bytes is decoded as UTF-8, so that the result is
   always a str.

   Raises IOError (OSError) if id cannot be opened as a file either.
   """
   try: urlopen = urllib.urlopen # Python 2
   except AttributeError: 
      from urllib.request import urlopen # Python 3
   try: 
      stream = urlopen(id)
      try: data = stream.read()
      finally: stream.close()
   except Exception: 
      # Deliberately broad: whatever went wrong while treating id as a
      # URI, the local file is still worth a try. Unlike a bare except,
      # this lets KeyboardInterrupt and SystemExit through.
      f = open(id, 'r')
      try: return f.read()
      finally: f.close()
   if not isinstance(data, str): data = data.decode('utf-8')
   return data

def preProcess(s): 
    """Return s with line endings normalised and N3 comments removed.

    The text is split into URI-refs, string literals, '#', newlines,
    runs of spaces/tabs and runs of other characters. Everything from a
    '#' token up to (but not including) the next newline is dropped. A
    '#' inside a URI-ref or a string literal, or in the middle of a run
    of non-space characters, is part of that token and is kept.

    Text without comments comes back unchanged apart from '\\r\\n' and
    '\\r' having become '\\n'.
    """
    s = s.replace('\r\n', '\n').replace('\r', '\n')
    URIREF = r'<[^ >]*>'
    STRLITA = r'"[^"\\]*(?:\\.[^"\\]*)*"'
    STRLITB = r'"""[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
    # The extra '\n' guarantees that a comment on the last line is
    # terminated; it is taken off again at the end.
    tokes = re.compile(r'(%s|%s|%s|#|\n|[ \t]+|\S+)' % \
        (URIREF, STRLITB, STRLITA)).findall(s) + ['\n']
    kept, in_comment = [], False
    for tok in tokes: 
        if in_comment: 
            if tok != '\n': continue # still inside the comment
            in_comment = False # the newline ends it, and is kept
        elif tok == '#': 
            in_comment = True
            continue
        kept.append(tok)
    # Compare the last token (not a slice of the list) with '\n'
    if kept[-1] == '\n': return ''.join(kept[:-1])
    else: return ''.join(kept)

def process(s): 
   """Run a string of N3 through the whole pipeline and return the result.

   In order: preProcess() removes the comments, toke() splits the text
   into tokens, declarations() takes out and records the declarations,
   compat() makes the tokens backwards compatible, and serialize()
   formats them as text.
   """
   return serialize(compat(declarations(toke(preProcess(s)))))

def run(): 
   """Command line entry point: pick the input named by sys.argv, print it.

   With a plain first argument, that argument is a file name or URI and
   its processed content is printed. With a -flags first argument:

      p, u, s  input mode: STDIN, the file/URI in the next argument, or
               the remaining arguments joined with spaces (checked in
               that order; exactly one is needed)
      i, i=id  read declarations from STDIN, or from the file/URI id;
               with p and a non-empty declaration source, that same
               text is also used as the input
      d        also print the KEYWORDS and PREFIXES tables
      t        print the token list of the result instead of the text

   If LocalSetup is set, its declarations are loaded first.

   Raises ValueError when no input mode is given, or when u is used
   without a file or URI. Both are checked before anything is read.
   """
   if sys.argv[1][0] != '-': 
      print(process(openid(sys.argv[1])))
      return

   flag, setup = sys.argv[1][1:], None
   has_id = 'i=' in flag
   # maxsplit=1, so that an id which itself contains '=' stays whole
   if has_id: Options = list(flag.split('=', 1)[0][:-1]) # minus the 'i'
   elif 'i' in flag: Options = list(flag.replace('i', ''))
   else: Options = list(flag)

   # Check the arguments before any file, URI or STDIN is read
   if not ('p' in Options or 'u' in Options or 's' in Options): 
      raise ValueError("No input mode selected: -[pus]")
   if 'u' in Options and 'p' not in Options and len(sys.argv) < 3: 
      raise ValueError("No file or URI given for -u")

   if has_id: setup = openid(flag.split('=', 1)[1])
   elif 'i' in flag: setup = sys.stdin.read()

   # Called for their effect on KEYWORDS and PREFIXES only; the tokens
   # they return are not needed
   if LocalSetup: declarations(toke(openid(LocalSetup)))
   if 'i' in flag: declarations(toke(setup))

   if 'p' in Options: 
      if ('i' in flag) and setup: feed = setup[:]
      else: feed = sys.stdin.read() # STDIN
   elif 'u' in Options: feed = openid(sys.argv[2]) # URI/fn
   else: feed = ' '.join(sys.argv[2:]) # Command line

   # print(...) with a single argument behaves the same whether print is
   # a statement or a function
   if 'd' in Options: 
      print('%s \n%s \n%s' % (process(feed), KEYWORDS, PREFIXES))
   elif 't' in Options: print(toke(process(feed)))
   else: print(process(feed))

if __name__=="__main__": 
   # Without arguments there is nothing to process, so show the usage
   # text instead. print(...) with one argument works whether print is
   # a statement or a function.
   if len(sys.argv) > 1: run()
   else: print(__doc__)