#!/usr/bin/python
"""A Web browser.

Fetches an http://, https://, ftp:// or local-file URI and writes it to
stdout; HTML is rendered as word-wrapped plain text.  Progress messages
and prompts go to stderr.

Command-line flags (a leading ``--`` is accepted as well as ``-``):
    -pipe, -p   read the URI from stdin
    -text       treat the document as text/plain
    -head       print the HTTP status line and headers instead of the body

The module runs unchanged on Python 2.6+ and Python 3.
"""

import base64
import os
import re
import string
import sys
from ftplib import FTP

try:  # Python 2 module names (the ones this script was written against)
    import htmlentitydefs
    import httplib
    import urllib
    import urllib2
    import urlparse
    from HTMLParser import HTMLParser
    from urllib import unquote
except ImportError:  # Python 3 moved them
    import html.entities as htmlentitydefs
    import http.client as httplib
    import urllib
    import urllib.parse as urlparse
    import urllib.request as urllib2
    from html.parser import HTMLParser
    from urllib.parse import unquote

try:
    _input = raw_input  # Python 2
except NameError:
    _input = input

special = ['html', 'title']
# Block level http://www.w3.org/TR/REC-CSS2/sample
block = ['address', 'blockquote', 'body', 'fieldset', 'form', 'frame',
         'frameset', 'iframe', 'noframes', 'object', 'p', 'applet', 'center',
         'dir', 'hr', 'menu', 'pre', 'ul', 'dl', 'ol', 'tr']
head = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
flow = ['li', 'div', 'dd', 'dt']
block.extend(head)

# HTTP status codes for which gethttp() offers to follow the Location header.
_REDIRECTS = (301, 302, 303, 307, 308)


def _emit(line):
    """Write *line* plus a newline to stdout (the old ``print line``)."""
    sys.stdout.write(line + '\n')


def _flag(name):
    """Return True if ``-name`` or ``--name`` was given on the command line."""
    return ('-' + name in sys.argv) or ('--' + name in sys.argv)


def _to_text(data, encoding='utf-8'):
    """Return *data* as the native ``str`` type.

    On Python 3 downloaded bytes are decoded (undecodable bytes are
    replaced, unknown encodings fall back to UTF-8); on Python 2 ``str``
    already is the byte string and is returned untouched.
    """
    if isinstance(data, str):
        return data
    try:
        return data.decode(encoding, 'replace')
    except LookupError:
        return data.decode('utf-8', 'replace')


def _charset(content_type):
    """Return the ``charset=`` parameter of a Content-Type value, or 'utf-8'."""
    for part in content_type.split(';')[1:]:
        key, _, value = part.strip().partition('=')
        if key.strip().lower() == 'charset' and value.strip():
            return value.strip().strip('"\'')
    return 'utf-8'


def _write_raw(data):
    """Write a downloaded chunk to stdout, as bytes where stdout allows it."""
    raw = getattr(sys.stdout, 'buffer', None)
    if raw is not None and not isinstance(data, str):
        sys.stdout.flush()  # keep ordering with earlier text-mode writes
        raw.write(data)
    else:
        sys.stdout.write(_to_text(data))


def _render_html(text):
    """Parse *text* as HTML and print its plain-text rendering."""
    parser = MyHTMLParser()
    parser.feed(text)
    parser.close()  # flushes text that follows the last block-level tag


# NOTE: shadows the builtin format(); the name is kept because it is this
# module's existing public interface.
def format(s, pre):
    """Print the buffered text *s* and return '' (the new, empty buffer).

    Unless *pre* is true, runs of spaces are first collapsed to a single
    space.  Text containing newlines is printed line by line; every line
    goes through align() for word wrapping.  A buffer that is empty or a
    lone space prints nothing.
    """
    if not pre:
        s = re.sub(' {2,}', ' ', s)
    if s in ('', ' '):
        pass
    elif s == ' \n':
        _emit('')  # hack for now
    elif '\n' in s:
        for line in s.split('\n'):
            align(line)
    else:
        align(s)
    return ''


def align(s, w=78):
    """Print *s* wrapped so that no output line is longer than *w* characters.

    Each line is broken at the last space within its first *w* characters,
    or hard-split at *w* when that window contains no space.  The remainder
    (possibly empty) is always printed as the final line.

    Raises ValueError if *w* is smaller than 1.
    """
    if w < 1:
        raise ValueError('w must be at least 1')
    # Iterative on purpose: one recursion level per output line overflowed
    # the interpreter stack on long documents.
    while len(s) >= w:
        window = s[:w]
        if ' ' in window:
            line, _, tail = window.rpartition(' ')
            rest = tail + s[w:]
        else:
            line, rest = window, s[w:]
        _emit(line)
        s = rest
    _emit(s)


class MyHTMLParser(HTMLParser):
    """HTML parser that prints a plain-text rendering of what it is fed.

    Text accumulates in ``self.data`` and is printed through format()
    whenever a block-level tag starts or ends.  Call close() after the
    last feed() so that trailing text is printed too.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Python 3 would otherwise decode entities itself and bypass
        # handle_entityref()/handle_charref(); harmless on Python 2.
        self.convert_charrefs = False
        # start/end: most recent start and end tag names; data: pending text.
        self.start, self.end, self.data = None, None, ''
        # preformat: 1 while inside <pre>; hidden: 1 while inside <head>.
        self.preformat, self.hidden = 0, 0

    def pre(self):
        """Return 1 if pending text must keep its whitespace, else 0."""
        if self.preformat == 1 or self.end == 'pre':
            return 1
        return 0

    def close(self):
        """Finish parsing and print whatever text is still buffered."""
        HTMLParser.close(self)
        if self.data:
            self.data = format(self.data, self.pre())

    def handle_starttag(self, tag, attrs):
        """Flush pending text before a block tag, then add the tag's marker."""
        self.start = tag
        if self.data and tag in block:
            self.data = format(self.data, self.pre())

        # Now for some more stuff
        if tag == 'pre':
            # NOTE(review): this literal was split across lines in the
            # source this was restored from -- confirm its exact text.
            self.data += '\n'
            self.preformat = 1
        elif tag == 'li':
            self.data += ' * '
        elif tag == 'dd':
            self.data += ' defintion: '
        elif tag == 'head':
            self.hidden = 1
        elif tag == 'img':
            for name, value in attrs:
                # value is None for an attribute written without a value
                if name == 'alt' and value:
                    self.data += value
        elif tag in ('blockquote', 'q'):
            self.data += 'quote'
            for name, value in attrs:
                if name == 'cite' and value:
                    self.data += ' (%s)' % value
            self.data += ': '
        elif tag == 'a':
            self.data += '<'
            for name, value in attrs:
                if name == 'href' and value:
                    self.data += '<%s> ' % value
        elif tag in head:
            self.data += '@ '

    def handle_endtag(self, tag):
        """Close markers opened by the start tag and flush block/flow text."""
        self.end = tag
        if tag == 'pre':
            # NOTE(review): literal was split across lines in the source
            # this was restored from -- confirm its exact text.
            self.data += '\n'
            self.preformat = 0
        if tag == 'a':
            self.data += '>'
        elif tag == 'head':
            self.hidden = 0
        elif tag in block:
            self.data = format(self.data + '\n', self.pre())
        elif tag in flow:
            self.data = format(self.data, self.pre())

    def handle_data(self, data):
        """Append character data to the pending text."""
        in_block = (self.start in block) or (self.start == 'title')
        # Drop leading whitespace only where it would start the line or
        # double an existing space; stripping every chunk used to glue
        # words onto a preceding entity ("a &amp; b" -> "a &b").
        if in_block and (not self.data or self.data[-1:].isspace()):
            data = data.lstrip()
        if self.preformat:
            self.data += data
        elif self.start not in ('script', 'applet'):
            for ch in ('\n', '\r', '\t'):
                data = data.replace(ch, ' ')
            if (not self.hidden) or (self.start == 'title'):
                self.data += data

    def handle_entityref(self, name):
        """Append the character for ``&name;``, or '?' if it is unknown."""
        self.data += htmlentitydefs.entitydefs.get(name, '?')

    def handle_charref(self, name):
        """Append the character for ``&#N;`` / ``&#xN;``, or '?' if invalid."""
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
            self.data += chr(code)
        except (ValueError, OverflowError):
            self.data += '?'


def httpget(uri):
    """GET *uri* and return ``(status, reason, headers, body)``.

    *headers* is the response's message object (case-insensitive
    ``get``); *body* is the raw, undecoded response body.
    """
    parts = urlparse.urlparse(uri)
    host = parts[1]
    # Rebuild the request target: an empty path must be sent as '/', and
    # the ;params and ?query parts belong to it as well.
    target = urlparse.urlunparse(('', '', parts[2] or '/', parts[3],
                                  parts[4], ''))
    if parts[0] == 'https':
        conn = httplib.HTTPSConnection(host)
    else:
        conn = httplib.HTTPConnection(host)
    try:
        # request() adds the Host header itself.
        conn.request('GET', target,
                     headers={'User-Agent': 'browser.py/x.beta'})
        resp = conn.getresponse()
        return resp.status, resp.reason, resp.msg, resp.read()
    finally:
        conn.close()


def gethttp(uri):
    """Fetch *uri* over HTTP and display it, asking before any redirect."""
    sys.stderr.write('Getting %s...\n' % uri)
    code, msg, info, data = httpget(uri)
    location = None
    if code in _REDIRECTS:
        location = info.get('location')
    if location:
        # Location may be relative to the requested URI.
        target = urlparse.urljoin(uri, location)
        sys.stderr.write('Code was %s (%s), continue? [Y/N]: '
                         % (str(code), target))
        try:
            answer = _input()
        except EOFError:  # no terminal to ask: do not follow
            answer = ''
        if answer.strip().lower() == 'y':
            gethttp(target)
            return
    fs(code, msg, info, data)


def fs(code, msg, info, data):
    """Display an HTTP response according to its type and the CLI flags.

    *info* is the header mapping (anything with ``get``), *data* the body.
    ``-head`` prints status and headers; otherwise text/html is rendered,
    text/plain is printed as is, and other types print nothing.
    """
    content_type = info.get('content-type') or ''
    if content_type:
        sys.stderr.write('Got it: %s\n' % content_type)
    else:
        sys.stderr.write('Got it\n')

    kind = content_type.strip().lower()
    if _flag('text'):
        kind = 'text/plain'

    if _flag('head'):
        _emit('%s %s\n\n%s' % (str(code), str(msg), str(info)))
    elif kind.startswith('text/html'):
        _render_html(_to_text(data, _charset(content_type)))
    elif kind.startswith('text/plain'):
        _emit(_to_text(data, _charset(content_type)))


def getftp(uri):
    """Retrieve an ftp:// URI; ``.html`` files are rendered, others copied.

    Credentials may be given as ``user:password@host`` (percent-encoded);
    without them the login is anonymous/anonymous.
    """
    parsed = urlparse.urlparse(uri)
    host, path = parsed[1], parsed[2]
    user, pswd = 'anonymous', 'anonymous'
    if '@' in host:
        userpass, host = host.rsplit('@', 1)
        user, _, pswd = userpass.partition(':')
        user, pswd = unquote(user), unquote(pswd)

    parts = path.split('/')
    fn = parts.pop()
    ftp = FTP(host, user, pswd)
    try:
        ftp.cwd('/'.join(parts))
        command = 'RETR %s' % fn
        if fn[-5:] == '.html':
            # Collect the whole file first: chunk boundaries can fall
            # inside a multi-byte character.
            chunks = []
            ftp.retrbinary(command, chunks.append)
            _render_html(_to_text(b''.join(chunks)))
        else:
            ftp.retrbinary(command, _write_raw)
        ftp.quit()
    finally:
        ftp.close()  # no-op after a successful quit()


def getfile(uri):
    """Display a local file named by a path or a ``file://`` URI.

    ``.html``/``.htm`` files are rendered unless ``-text`` was given;
    anything else is copied to stdout unchanged.
    """
    path = uri
    if path[:7] == 'file://':
        path = unquote(urlparse.urlparse(path)[2])
    with open(path, 'rb') as f:
        data = f.read()
    if path.lower().endswith(('.html', '.htm')) and not _flag('text'):
        _render_html(_to_text(data))
    else:
        _write_raw(data)


def geturi(uri):
    """Dispatch *uri* to the HTTP, FTP or local-file handler by its scheme."""
    uri = uri.strip()  # piped or typed input carries a trailing newline
    if uri[:7] == 'http://' or uri[:8] == 'https://':
        gethttp(uri)
    elif uri[:6] == 'ftp://':
        getftp(uri)
    else:
        getfile(uri)


def prompt():
    """Prompt for a URI, and get the URI entered."""
    sys.stderr.write('URI: ')
    geturi(_input())


def run():
    """Command-line entry point: take the URI from stdin, argv or a prompt."""
    argv = []
    for arg in sys.argv:
        if arg[0:2] == '--':
            arg = arg[1:]
        argv.append(arg)

    if ('-pipe' in argv) or ('-p' in argv):
        geturi(sys.stdin.read())
        return
    # The URI is the first argument that is not a flag, so that
    # "browser.py -text http://..." works as well as the reverse order.
    targets = [arg for arg in argv[1:] if arg[:1] != '-']
    if targets:
        geturi(targets[0])
    else:
        prompt()


if __name__ == "__main__":
    run()