""" BProxy - an HTTP proxy that logs URI/title information by Sean B. Palmer, 2002-10 based heavily on Mojo Nation's proxy: http://cvs.sf.net/cgi-bin/viewcvs.cgi/mojonation/evil/proxy/AsyncMojoProxy.py with a few clean-ups TODO: * Only scan for a title when it's text/html * Enable on/off without shutting down the proxy """ import sys, os, os.path, re, time, urlparse, mimetools, BaseHTTPServer import socket, asyncore, asynchat from StringIO import StringIO ADDR_TO_BIND_TO = '127.0.0.1' def log(s): if not s.endswith('\n'): s += '\n' sys.stderr.write(s) def note(s): if not s.endswith('\n'): s += '\n' fn = time.strftime('%Y-%m-%d.log', time.gmtime(time.time())) open(fn, 'a').write(s) FILTER_ADS = os.path.exists('ad-hosts.txt') and os.path.exists('ad-paths.txt') if FILTER_ADS: ad_hosts = '(%s)' % '|'.join(open('ad-hosts.txt').read().splitlines()) ad_paths = '(%s)' % '|'.join(open('ad-paths.txt').read().splitlines()) def filterURI(host, path): return (re.compile(ad_hosts + '$').search(host) or re.compile('^[^/]*'+ad_paths).search(host+path)) class MyAsynchat(asynchat.async_chat): def log_info(self, message, type='info'): if __debug__ or type != 'info': # if __debug__? ugh sys.stderr.write('%s: %s\n' % (type, message)) class HTTPProxySender(MyAsynchat): def __init__(self, receiver, id, host, port): asynchat.async_chat.__init__(self) self.receiver = receiver self.id = id self.data = '' self.set_terminator(None) self.create_socket(socket.AF_INET, socket.SOCK_STREAM) self.host = host self.port = port try: self.connect((host, port)) except socket.error, e: log('(%d) XXX %s' % (self.id, e)) self.receiver.sender_connection_error(e) self.close() return def handle_connect(self): log('(%d) S handle_connect' % self.id) try: self.receiver.sender_is_connected() except socket.error, e: log('(%d) OOO %s' % (self.id, e)) if hasattr(self, 'receiver'): self.receiver.sender_connection_error(e) self.close() return log('(%d) sender connected' % self.id) def return_error(self, e): log('(%d) sender got socket error: %s' % (self.id, e)) if (isinstance(e, socket.error) and (type(e.args) == type(())) and (len(e.args) == 2)): e = e.args[1] # get the error string only self.receiver.error(404, 'Error connecting to %s on port %d: %s' \ % (self.host, self.port, e), response=str(e)) self.close() def collect_incoming_data(self, data): log('<== (%d) %d bytes' % (self.id, len(data))) self.parse_data(data) self.receiver.push(data) def parse_data(self, data): if self.data is not None: self.data += data title = re.compile(r'(?i)([^<]+)(?!(?:[^-]|-(?!-))*-->)') foundtitle = title.findall(data) if foundtitle: t = foundtitle[0].replace('"', '\\"').strip() t = re.sub('[\t\r\n\f]', ' ', t) note(' %s "%s"' % (self.receiver.url, t)) self.data = None def handle_close(self): log('(%d) sender closing' % self.id) timen = time.strftime('%H:%M:%S', time.gmtime(time.time())) self.receiver.close_when_done() del self.receiver # break circular reference self.close() class HTTPProxyReceiver(MyAsynchat): channel_counter = [0] def __init__(self, server, (conn, addr)): self.id = self.channel_counter[0] # used during log calls try: self.channel_counter[0] += 1 except OverflowError: self.channel_counter[0] = 0 asynchat.async_chat.__init__(self, conn) self.set_terminator('\n') self.server = server self.buffer = StringIO() # in the beginning there was GET... self.found_terminator = self.read_http_request def collect_incoming_data(self, data): self.buffer.write(data) def push_incoming_data_to_sender(self, data): # e.g. when using POST or PUT log('==> (%d) %d bytes' % (self.id, len(data))) self.sender.push(data) def read_http_request(self): request = self.buffer.getvalue() self.buffer = StringIO() log('%s - %s' % (time.ctime(time.time()), request)) # client-originated shutdown hack: if request.strip() == 'quit': log('External quit command received.') raise asyncore.ExitNow try: self.method, self.url, self.protocol = request.split() self.method = self.method.upper() except: self.error(400, "Can't parse request") if not self.url: self.error(400, "Empty URL") else: timen = time.strftime('%H:%M:%S', time.gmtime(time.time())) note('%s %s' % (timen, self.url)) if self.method not in ['CONNECT', 'GET', 'HEAD', 'POST', 'PUT']: self.error(501, "Unknown request method (%s)" % self.method) if self.method == 'CONNECT': self.netloc = self.url self.scheme = 'https' self.path = '' params, query, fragment = '', '', '' else: # split url into site and path (self.scheme, self.netloc, self.path, params, query, fragment) = urlparse.urlparse(self.url) if self.scheme.lower() not in ('http', ''): self.error(501, "Unknown request scheme (%s)" % self.scheme) # find port number if ':' in self.netloc: self.host, self.port = self.netloc.split(':') self.port = int(self.port) else: self.host = self.netloc if self.method == 'CONNECT': self.port = 443 # default SSL port else: self.port = 80 # now we have the url and host if FILTER_ADS: if filterURI(self.host, self.path): self.error(404, "Not found: banned") del self.initiate_send # gives a big error, not many small ones self.original_host_and_port = None self.path = urlparse.urlunparse(('', '', self.path, params, query, fragment)) # now we have the url, host, and path if (self.host == '') and self.path.startswith('/'): path = self.path.lstrip('/') if path == 'off': PROXY_STATE = 0 elif path == 'on': PROXY_STATE = 1 elif path == 'refererOn': REFERER_STATE = 1 elif path == 'refererOff': REFERER_STATE = 0 # a "file" to read the headers into for mimetools.Message self.rawheaders = StringIO() self.found_terminator = self.read_http_headers def read_http_headers(self): header = self.buffer.getvalue() self.buffer = StringIO() if header and header[0] != '\r': self.rawheaders.write(header) self.rawheaders.write('\n') else: # all headers have been read, process them self.rawheaders.seek(0) self.mimeheaders = mimetools.Message(self.rawheaders) if ((self.method == 'POST' or self.method == 'PUT') and not self.mimeheaders.has_key('content-length')): self.error(400, "Missing Content-Length " "for %s method" % self.method) self.length = int(self.mimeheaders.get('content-length', 0)) del self.mimeheaders['accept-encoding'] del self.mimeheaders['proxy-connection'] # put in whatever User-Agent here ua = 'Mozilla/4.0 ' if self.host.endswith('microsoft.com'): ua += '(compatible; MSIE 5.0; Windows ME) Opera 6.01 [en]\r' else: ua += '(compatible; MSIE 6.0; Windows 98; Win 9x 4.90)\r' self.mimeheaders['User-Agent'] = ua # strip off referer from urls we don't want referer headers on # referer = self.mimeheaders.get('referer', 0) # if (referer and # referer_to_strip_re.search(self.mimeheaders['referer'])): # IMHO, we should -always- do this but unfortunately some # stupid web sites probably depend on it. -greg # del self.mimeheaders['referer'] if self.port == 80: self.mimeheaders['Host'] = self.host else: # @@ is the host header ever supposed to have the port? self.mimeheaders['Host'] = '%s:%s' % (self.host, self.port) self.mimeheaders['Host'] += '\r' # some sites break without this self.sender = HTTPProxySender(self, self.id, self.host, self.port) self.push_request_to_sender() def push_request_to_sender(self): headers = ''.join(self.mimeheaders.headers) request = '%s %s HTTP/1.0\r\n%s\r\n' % (self.method, self.path, headers) if self.original_host_and_port: log('(%d) sending req. (original_host_and_port):' % self.id) else: log('(%d) sending request to server:' % self.id) log(`request`) self.sender.push(request) self.set_terminator(None) self.buffer = StringIO() def sender_is_connected(self): """The sender calls this to tell us when it is ready for more data.""" log('(%d) R sender_is_connected()' % self.id) # sender gave us the OK, give it our buffered data and any future data self.push_incoming_data_to_sender(self.buffer.getvalue()) self.buffer = None self.collect_incoming_data = self.push_incoming_data_to_sender def sender_connection_error(self, e): log('(%d) R sender_connection_error(%s) for %s:%s\n' % (self.id, e, self.host, self.port)) # if this was a redirected request and the redirection failed... if self.original_host_and_port: self.sender = HTTPProxySender(self, self.id, self.host, self.port) self.push_request_to_sender() return if (isinstance(e, socket.error) and type(e.args) == type(()) and len(e.args) == 2): e = e.args[1] # get the error string only self.error(404, 'Error connecting to %s on port ' '%d: %s' % (self.host, self.port, e), response=str(e)) def handle_close(self): log('(%d) receiver closing' % self.id) if hasattr(self, 'sender'): # self.sender.close() should be fine except for PUT requests? self.sender.close_when_done() del self.sender # break circular reference self.close() def show_error(self, code, body, response=None): if not response: response = BaseHTTPServer.BaseHTTPRequestHandler.responses[code][0] self.push("HTTP/1.0 %s %s\r\n" % (code, response)) self.push("Server: B-Proxy\r\n") self.push("Content-type: text/html\r\n") self.push("\r\n") self.push('\n\n%d %s\n\n' '\n%s\n\n' % (code, response, body)) def error(self, code, body, response=None): self.show_error(code, body, response=response) if hasattr(self, 'sender'): self.sender.handle_close() del self.sender # break circular reference self.close() class HTTPProxyServer(asyncore.dispatcher): def __init__(self, port): asyncore.dispatcher.__init__(self) self.create_socket(socket.AF_INET, socket.SOCK_STREAM) self.set_reuse_addr() self.ouraddr = (ADDR_TO_BIND_TO, port) log('Starting proxy on %s port %d' % self.ouraddr) self.bind(self.ouraddr) self.listen(5) self.rs = [] def handle_accept(self): HTTPProxyReceiver(self, self.accept()) def log_info(self, message, type='info'): if __debug__ or type != 'info': sys.stderr.write('%s: %s\n' % (type, message)) if __name__ == '__main__': log('Stopping external proxies:') s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) try: try: s.connect(('localhost', 8000)) s.send('quit\r\n') finally: s.close() except: log('Could not connect to locahost 8000, oh well...') if len(sys.argv) >= 2: PORT = int(sys.argv[1]) else: PORT = 8000 ps = HTTPProxyServer(PORT) log('Starting service...') asyncore.loop() # [EOF]