"""
BProxy - an HTTP proxy that logs URI/title information
by Sean B. Palmer, 2002-10
based heavily on Mojo Nation's proxy:
http://cvs.sf.net/cgi-bin/viewcvs.cgi/mojonation/evil/proxy/AsyncMojoProxy.py
with a few clean-ups
TODO:
* Only scan for a title when it's text/html
* Enable on/off without shutting down the proxy
"""
import sys, os, os.path, re, time, urlparse, mimetools, BaseHTTPServer
import socket, asyncore, asynchat
from StringIO import StringIO
# Loopback only: the proxy accepts connections from this machine alone.
ADDR_TO_BIND_TO = '127.0.0.1'
def log(s):
    """Write s to stderr, guaranteeing a trailing newline."""
    if not s.endswith('\n'):
        s = s + '\n'
    sys.stderr.write(s)
def note(s):
    """Append s (newline-terminated) to today's UTC-dated log file.

    The file is named YYYY-MM-DD.log in the current directory and is
    opened, written, and closed on every call.
    """
    if not s.endswith('\n'):
        s += '\n'
    fn = time.strftime('%Y-%m-%d.log', time.gmtime(time.time()))
    # Close the handle explicitly instead of leaking it to the garbage
    # collector as the original `open(fn, 'a').write(s)` did.
    f = open(fn, 'a')
    try:
        f.write(s)
    finally:
        f.close()
# Ad filtering is enabled only when both ban-list files are present.
FILTER_ADS = os.path.exists('ad-hosts.txt') and os.path.exists('ad-paths.txt')
if FILTER_ADS:
    # Build one alternation pattern per list (one banned entry per line).
    ad_hosts = '(%s)' % '|'.join(open('ad-hosts.txt').read().splitlines())
    ad_paths = '(%s)' % '|'.join(open('ad-paths.txt').read().splitlines())
    # Compile once at import time; the original recompiled both patterns
    # on every single request.
    _ad_hosts_re = re.compile(ad_hosts + '$')
    _ad_paths_re = re.compile('^[^/]*' + ad_paths)
    def filterURI(host, path):
        """Return a truthy match if host, or host+path, hits a ban list."""
        return (_ad_hosts_re.search(host) or
                _ad_paths_re.search(host + path))
class MyAsynchat(asynchat.async_chat):
    """async_chat with quieter logging: routine 'info' messages are
    emitted only when running with __debug__ set."""
    def log_info(self, message, type='info'):
        # Drop 'info'-level chatter in optimized (-O) runs.
        if type == 'info' and not __debug__:
            return
        sys.stderr.write('%s: %s\n' % (type, message))
class HTTPProxySender(MyAsynchat):
def __init__(self, receiver, id, host, port):
asynchat.async_chat.__init__(self)
self.receiver = receiver
self.id = id
self.data = ''
self.set_terminator(None)
self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
self.host = host
self.port = port
try: self.connect((host, port))
except socket.error, e:
log('(%d) XXX %s' % (self.id, e))
self.receiver.sender_connection_error(e)
self.close()
return
def handle_connect(self):
log('(%d) S handle_connect' % self.id)
try: self.receiver.sender_is_connected()
except socket.error, e:
log('(%d) OOO %s' % (self.id, e))
if hasattr(self, 'receiver'):
self.receiver.sender_connection_error(e)
self.close()
return
log('(%d) sender connected' % self.id)
def return_error(self, e):
log('(%d) sender got socket error: %s' % (self.id, e))
if (isinstance(e, socket.error)
and (type(e.args) == type(()))
and (len(e.args) == 2)): e = e.args[1] # get the error string only
self.receiver.error(404,
'Error connecting to %s on port %d: %s' \
% (self.host, self.port, e), response=str(e))
self.close()
def collect_incoming_data(self, data):
log('<== (%d) %d bytes' % (self.id, len(data)))
self.parse_data(data)
self.receiver.push(data)
def parse_data(self, data):
if self.data is not None:
self.data += data
title = re.compile(r'(?i)
([^<]+)(?!(?:[^-]|-(?!-))*-->)')
foundtitle = title.findall(data)
if foundtitle:
t = foundtitle[0].replace('"', '\\"').strip()
t = re.sub('[\t\r\n\f]', ' ', t)
note(' %s "%s"' % (self.receiver.url, t))
self.data = None
def handle_close(self):
log('(%d) sender closing' % self.id)
timen = time.strftime('%H:%M:%S', time.gmtime(time.time()))
self.receiver.close_when_done()
del self.receiver # break circular reference
self.close()
class HTTPProxyReceiver(MyAsynchat):
channel_counter = [0]
def __init__(self, server, (conn, addr)):
self.id = self.channel_counter[0] # used during log calls
try: self.channel_counter[0] += 1
except OverflowError:
self.channel_counter[0] = 0
asynchat.async_chat.__init__(self, conn)
self.set_terminator('\n')
self.server = server
self.buffer = StringIO()
# in the beginning there was GET...
self.found_terminator = self.read_http_request
def collect_incoming_data(self, data):
self.buffer.write(data)
def push_incoming_data_to_sender(self, data):
# e.g. when using POST or PUT
log('==> (%d) %d bytes' % (self.id, len(data)))
self.sender.push(data)
def read_http_request(self):
request = self.buffer.getvalue()
self.buffer = StringIO()
log('%s - %s' % (time.ctime(time.time()), request))
# client-originated shutdown hack:
if request.strip() == 'quit':
log('External quit command received.')
raise asyncore.ExitNow
try:
self.method, self.url, self.protocol = request.split()
self.method = self.method.upper()
except: self.error(400, "Can't parse request")
if not self.url: self.error(400, "Empty URL")
else:
timen = time.strftime('%H:%M:%S', time.gmtime(time.time()))
note('%s %s' % (timen, self.url))
if self.method not in ['CONNECT', 'GET', 'HEAD', 'POST', 'PUT']:
self.error(501, "Unknown request method (%s)" % self.method)
if self.method == 'CONNECT':
self.netloc = self.url
self.scheme = 'https'
self.path = ''
params, query, fragment = '', '', ''
else:
# split url into site and path
(self.scheme, self.netloc, self.path,
params, query, fragment) = urlparse.urlparse(self.url)
if self.scheme.lower() not in ('http', ''):
self.error(501, "Unknown request scheme (%s)" % self.scheme)
# find port number
if ':' in self.netloc:
self.host, self.port = self.netloc.split(':')
self.port = int(self.port)
else:
self.host = self.netloc
if self.method == 'CONNECT': self.port = 443 # default SSL port
else: self.port = 80
# now we have the url and host
if FILTER_ADS:
if filterURI(self.host, self.path):
self.error(404, "Not found: banned")
del self.initiate_send # gives a big error, not many small ones
self.original_host_and_port = None
self.path = urlparse.urlunparse(('', '',
self.path, params, query, fragment))
# now we have the url, host, and path
if (self.host == '') and self.path.startswith('/'):
path = self.path.lstrip('/')
if path == 'off': PROXY_STATE = 0
elif path == 'on': PROXY_STATE = 1
elif path == 'refererOn': REFERER_STATE = 1
elif path == 'refererOff': REFERER_STATE = 0
# a "file" to read the headers into for mimetools.Message
self.rawheaders = StringIO()
self.found_terminator = self.read_http_headers
def read_http_headers(self):
header = self.buffer.getvalue()
self.buffer = StringIO()
if header and header[0] != '\r':
self.rawheaders.write(header)
self.rawheaders.write('\n')
else:
# all headers have been read, process them
self.rawheaders.seek(0)
self.mimeheaders = mimetools.Message(self.rawheaders)
if ((self.method == 'POST' or self.method == 'PUT')
and not self.mimeheaders.has_key('content-length')):
self.error(400, "Missing Content-Length "
"for %s method" % self.method)
self.length = int(self.mimeheaders.get('content-length', 0))
del self.mimeheaders['accept-encoding']
del self.mimeheaders['proxy-connection']
# put in whatever User-Agent here
ua = 'Mozilla/4.0 '
if self.host.endswith('microsoft.com'):
ua += '(compatible; MSIE 5.0; Windows ME) Opera 6.01 [en]\r'
else:
ua += '(compatible; MSIE 6.0; Windows 98; Win 9x 4.90)\r'
self.mimeheaders['User-Agent'] = ua
# strip off referer from urls we don't want referer headers on
# referer = self.mimeheaders.get('referer', 0)
# if (referer and
# referer_to_strip_re.search(self.mimeheaders['referer'])):
# IMHO, we should -always- do this but unfortunately some
# stupid web sites probably depend on it. -greg
# del self.mimeheaders['referer']
if self.port == 80: self.mimeheaders['Host'] = self.host
else: # @@ is the host header ever supposed to have the port?
self.mimeheaders['Host'] = '%s:%s' % (self.host, self.port)
self.mimeheaders['Host'] += '\r' # some sites break without this
self.sender = HTTPProxySender(self, self.id, self.host, self.port)
self.push_request_to_sender()
def push_request_to_sender(self):
headers = ''.join(self.mimeheaders.headers)
request = '%s %s HTTP/1.0\r\n%s\r\n' % (self.method, self.path, headers)
if self.original_host_and_port:
log('(%d) sending req. (original_host_and_port):' % self.id)
else: log('(%d) sending request to server:' % self.id)
log(`request`)
self.sender.push(request)
self.set_terminator(None)
self.buffer = StringIO()
def sender_is_connected(self):
"""The sender calls this to tell us when it is ready for more data."""
log('(%d) R sender_is_connected()' % self.id)
# sender gave us the OK, give it our buffered data and any future data
self.push_incoming_data_to_sender(self.buffer.getvalue())
self.buffer = None
self.collect_incoming_data = self.push_incoming_data_to_sender
def sender_connection_error(self, e):
log('(%d) R sender_connection_error(%s) for %s:%s\n' % (self.id, e,
self.host, self.port))
# if this was a redirected request and the redirection failed...
if self.original_host_and_port:
self.sender = HTTPProxySender(self, self.id, self.host, self.port)
self.push_request_to_sender()
return
if (isinstance(e, socket.error)
and type(e.args) == type(())
and len(e.args) == 2): e = e.args[1] # get the error string only
self.error(404, 'Error connecting to %s on port '
'%d: %s' % (self.host, self.port, e), response=str(e))
def handle_close(self):
log('(%d) receiver closing' % self.id)
if hasattr(self, 'sender'):
# self.sender.close() should be fine except for PUT requests?
self.sender.close_when_done()
del self.sender # break circular reference
self.close()
def show_error(self, code, body, response=None):
if not response:
response = BaseHTTPServer.BaseHTTPRequestHandler.responses[code][0]
self.push("HTTP/1.0 %s %s\r\n" % (code, response))
self.push("Server: B-Proxy\r\n")
self.push("Content-type: text/html\r\n")
self.push("\r\n")
self.push('\n\n%d %s\n\n'
'\n%s\n\n' % (code, response, body))
def error(self, code, body, response=None):
self.show_error(code, body, response=response)
if hasattr(self, 'sender'):
self.sender.handle_close()
del self.sender # break circular reference
self.close()
class HTTPProxyServer(asyncore.dispatcher):
    """Listening socket: accepts client connections and wraps each in
    an HTTPProxyReceiver."""

    def __init__(self, port):
        asyncore.dispatcher.__init__(self)
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        self.set_reuse_addr()
        self.ouraddr = (ADDR_TO_BIND_TO, port)
        log('Starting proxy on %s port %d' % self.ouraddr)
        self.bind(self.ouraddr)
        self.listen(5)
        self.rs = []

    def handle_accept(self):
        conn_and_addr = self.accept()
        HTTPProxyReceiver(self, conn_and_addr)

    def log_info(self, message, type='info'):
        # Suppress routine 'info' chatter unless __debug__ is set.
        if type != 'info' or __debug__:
            sys.stderr.write('%s: %s\n' % (type, message))
if __name__ == '__main__':
    # Ask any proxy already listening on port 8000 to shut down, using
    # the client-originated 'quit' hack understood by HTTPProxyReceiver.
    log('Stopping external proxies:')
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        try:
            s.connect(('localhost', 8000))
            s.send('quit\r\n')
        finally: s.close()
    except socket.error:
        # Was a bare except:; only the socket calls above can fail.
        # (Also fixes the 'locahost' typo in the message.)
        log('Could not connect to localhost 8000, oh well...')
    # Optional port on the command line; defaults to 8000.
    if len(sys.argv) >= 2: PORT = int(sys.argv[1])
    else: PORT = 8000
    ps = HTTPProxyServer(PORT)
    log('Starting service...')
    asyncore.loop()
# [EOF]