Running an HTTP Proxy Server

Besides HTTP servers and clients, twisted.web includes support for writing HTTP proxies. A proxy is a client and server in one: it accepts requests from clients (acting as a server) and forwards them to servers (acting as a client). Then it sends the response back to the client who originally sent the request. HTTP proxies are useful mostly for the additional services they can provide, such as caching, filtering, and usage reporting. This lab shows how to build an HTTP proxy using Twisted.

4.6.1. How Do I Do That?

The twisted.web package includes twisted.web.proxy, a module with classes for building HTTP proxies. Example 4-7 shows how easy it is to set up a basic proxy.

Example 4-7. simpleproxy.py

# simpleproxy.py: a bare-bones HTTP proxy listening on TCP port 8001.
from twisted.web import proxy, http
from twisted.internet import reactor
from twisted.python import log
import sys

# Route Twisted's log messages to stdout so proxy activity can be watched.
log.startLogging(sys.stdout)


class ProxyFactory(http.HTTPFactory):
    """HTTP server factory that uses proxy.Proxy as its connection protocol."""

    protocol = proxy.Proxy


reactor.listenTCP(8001, ProxyFactory())
reactor.run()

Run simpleproxy.py from the command line and you'll have an HTTP proxy running on localhost port 8001. Set up a web browser to use this proxy and try surfing some web pages. The call to log.startLogging prints all HTTP log messages to stdout so you can watch the proxy at work:

$ python simpleproxy.py
2005/06/13 00:22 EDT [-] Log opened.
2005/06/13 00:22 EDT [-] __main__.ProxyFactory starting on 8001
2005/06/13 00:22 EDT [-] Starting factory <__main__.ProxyFactory instance at 0xb7d9d10c>
2005/06/13 00:23 EDT [Proxy,0,127.0.0.1] Starting factory
2005/06/13 00:23 EDT [-] Enabling Multithreading.
2005/06/13 00:23 EDT [Proxy,1,127.0.0.1] Starting factory
2005/06/13 00:23 EDT [Proxy,2,127.0.0.1] Starting factory
...

That gives you a working proxy, but not one that does anything useful. Example 4-8 dives deeper into the twisted.web.proxy module to build a proxy that keeps track of the most frequently used words in the HTML documents being browsed.

Example 4-8. wordcountproxy.py

import sgmllib, re from twisted.web import proxy, http import sys from twisted.python import log log.startLogging(sys.stdout) WEB_PORT = 8000 PROXY_PORT = 8001 class WordParser(sgmllib.SGMLParser): def _ _init_ _(self): sgmllib.SGMLParser._ _init_ _(self) self.chardata = [] self.inBody = False def start_body(self, attrs): self.inBody = True def end_body(self): self.inBody = False def handle_data(self, data): if self.inBody: self.chardata.append(data) def getWords(self): # extract words wordFinder = re.compile(r'w*') words = wordFinder.findall("".join(self.chardata)) words = filter(lambda word: word.strip( ), words) print "WORDS ARE", words return words class WordCounter(object): ignoredWords = "the a of in from to this that and or but is was be can could i you they we at".split( ) def _ _init_ _(self): self.words = {} def addWords(self, words): for word in words: word = word.lower( ) if not word in self.ignoredWords: currentCount = self.words.get(word, 0) self.words[word] = currentCount + 1 class WordCountProxyClient(proxy.ProxyClient): def handleHeader(self, key, value): proxy.ProxyClient.handleHeader(self, key, value) if key.lower( ) == "content-type": if value.split(';')[0] == 'text/html': self.parser = WordParser( ) def handleResponsePart(self, data): proxy.ProxyClient.handleResponsePart(self, data) if hasattr(self, 'parser'): self.parser.feed(data) def handleResponseEnd(self): proxy.ProxyClient.handleResponseEnd(self) if hasattr(self, 'parser'): self.parser.close( ) self.father.wordCounter.addWords(self.parser.getWords( )) del(self.parser) class WordCountProxyClientFactory(proxy.ProxyClientFactory): def buildProtocol(self, addr): client = proxy.ProxyClientFactory.buildProtocol(self, addr) # upgrade proxy.proxyClient object to WordCountProxyClient client._ _class_ _ = WordCountProxyClient return client class WordCountProxyRequest(proxy.ProxyRequest): protocols = {'http': WordCountProxyClientFactory} def _ _init_ _(self, wordCounter, *args): self.wordCounter = 
wordCounter proxy.ProxyRequest._ _init_ _(self, *args) class WordCountProxy(proxy.Proxy): def _ _init_ _(self, wordCounter): self.wordCounter = wordCounter proxy.Proxy._ _init_ _(self) def requestFactory(self, *args): return WordCountProxyRequest(self.wordCounter, *args) class WordCountProxyFactory(http.HTTPFactory): def _ _init_ _(self, wordCounter): self.wordCounter = wordCounter http.HTTPFactory._ _init_ _(self) def buildProtocol(self, addr): protocol = WordCountProxy(self.wordCounter) return protocol # classes for web reporting interface class WebReportRequest(http.Request): def _ _init_ _(self, wordCounter, *args): self.wordCounter = wordCounter http.Request._ _init_ _(self, *args) def process(self): self.setHeader("Content-Type", "text/html") words = self.wordCounter.words.items( ) words.sort(lambda (w1, c1), (w2, c2): cmp(c2, c1)) for word, count in words: self.write("