#!/usr/bin/env python
"""parse.py - Parsing utilities for pyrple."""

import sys
import os
import re
import urllib  # kept: existing module-level name

try:
    import cPickle as pickle
except ImportError:  # no cPickle on this interpreter (Python 3)
    import pickle

try:
    from urllib import quote as _quote, urlopen as _urlopen
except ImportError:  # Python 3 split urllib into submodules
    from urllib.parse import quote as _quote
    from urllib.request import urlopen as _urlopen

# Should we cache by default or not?
# cache = '~/.pyrple/cache/'
cache = False


# Tools to get content from the Web

def get(uri, v=None):
    """Fetch `uri` and return a ``(contentType, body)`` tuple.

    `contentType` is the raw Content-Type header (or None) and `body` is
    whatever the response's ``read()`` returned.  If `v` is true, progress
    messages are written to stderr.

    When the module-level `cache` names a directory, responses that carry
    both a Content-Type and a Last-Modified header are pickled there, keyed
    by the percent-quoted URI, and reused while the
    (Content-Type, Last-Modified, Content-Length) triple is unchanged.

    Side effect: a truthy `cache` is replaced by its user-expanded form.
    """
    global cache

    # @@ send an accept: application/rdf+xml header
    u = _urlopen(uri)  # @@ urllib2
    try:
        info = u.info()
        contentType = info.get('Content-Type')
        lastModified = info.get('Last-Modified')
        contentLength = info.get('Content-Length')

        s = None
        path = None

        if cache:
            cache = os.path.expanduser(cache)
            if not os.path.exists(cache):
                os.makedirs(cache)

            # join() rather than '+', so a cache directory configured
            # without a trailing separator still works.
            path = os.path.join(cache, _quote(uri, safe=''))

            # get a cached version if there is one
            if os.path.exists(path) and contentType and lastModified:
                # NOTE: unpickling executes whatever the file describes;
                # only point `cache` at a directory you control.
                with open(path, 'rb') as f:
                    cached = pickle.load(f)
                if cached[:3] == (contentType, lastModified, contentLength):
                    if v:
                        sys.stderr.write(
                            "Using cached data: %s\n" % lastModified)
                    s = cached[3]
            elif os.path.exists(path):
                # The new response lacks the headers needed to validate a
                # cache entry, so the stored copy can no longer be trusted.
                if v:
                    sys.stderr.write(
                        "Removing cache: new version uncacheable\n")
                os.remove(path)  # @@

        # since there's no cached data, read from the Web
        if s is None:
            if not contentLength:
                s = u.read()
            else:
                s = u.read(int(contentLength))

            if path is not None and contentType and lastModified:
                if v:
                    sys.stderr.write("Caching version: %s\n" % lastModified)
                cached = (contentType, lastModified, contentLength, s)
                with open(path, 'wb') as f:
                    pickle.dump(cached, f)
    finally:
        # Always release the connection, including on error paths.
        u.close()

    return (contentType, s)


# Tools to gauge the serialization type

RDFXML = 'application/rdf+xml'
N3 = 'application/n3'
NTRIPLES = 'application/n-triples'
HTML = 'text/html'

# Leading whitespace followed by '<?', '<!' or an opening tag with attributes.
r_xml = re.compile(r'^[\t\r\n ]*(<[?!]|<[^ >]+ )')


def getType(contentType, s):
    """Guess the RDF serialization type of the input.

    `contentType` is a Content-Type header value or None; any parameters
    (e.g. ``; charset=utf-8``) are ignored and matching is
    case-insensitive.  `s` is the document text and is only inspected when
    the media type is generic (text/plain, application/octet-stream or
    missing).

    Returns one of RDFXML, N3, NTRIPLES or HTML.
    Raises NotImplementedError for an unrecognised media type.
    """
    if contentType is None:
        contentType = ''

    # Strip parameters so 'text/plain; charset=utf-8' is handled like
    # 'text/plain' by the exact-match and endswith tests below.
    mime = contentType.split(';', 1)[0].strip().lower()

    if mime.startswith('application/rdf+xml'):
        return RDFXML
    if mime.endswith(('notation3', 'n3')):
        return N3
    if mime in ('text/xml', 'application/xml'):
        # @@ check root namespace? option to extract if transparent?
        return RDFXML
    if mime.startswith(('text/html', 'application/xhtml+xml')):
        return HTML
    if mime.endswith(('ntriples', 'n-triples', 'nt')):
        return NTRIPLES

    if mime in ('text/plain', 'application/octet-stream', ''):
        # Generic type: sniff the content instead.
        if r_xml.match(s):
            return RDFXML
        try:
            from ntriplesg import isNTriples
        except ImportError:
            # Without the optional N-Triples checker, assume N3.
            return N3
        if isNTriples(s):
            return NTRIPLES
        return N3

    # String exceptions are rejected by Python 2.6+, so raise a real one.
    raise NotImplementedError("Unknown MIME type: %s" % contentType)


# Tools to extract RDF from HTML

# An <RDF ...> element, with or without a namespace prefix, up to the closing
# tag carrying the same prefix (backreference \2).
r_rdfxml = re.compile(r'(?sm)(<(([^\s:]+:)?)RDF .+?</\2RDF>)')


def parseFromHTML(s, base=None):
    """Extract embedded RDF/XML islands from the HTML text `s`.

    Returns a list of ``(base, RDFXML, fragment)`` tuples, one per
    ``<RDF ...>...</RDF>`` element found, in document order.  `base`
    defaults to the empty string.
    """
    if base is None:
        base = ''
    # findall() yields one tuple per match; element 0 is the whole fragment.
    return [(base, RDFXML, groups[0]) for groups in r_rdfxml.findall(s)]


if __name__ == "__main__":
    print(__doc__)