#!/usr/local/bin/perl # # CGIProxy 2.0.1 # # nph-proxy.cgi-- CGIProxy 2.0.1: a proxy in the form of a CGI script. # Retrieves the resource at any HTTP or FTP URL, updating embedded URLs # in HTML resources to point back through this script. By default, no # user info is sent to the server. Options include text-only proxying # to save bandwidth, cookie filtering, ad filtering, script removal, # user-defined encoding of the target URL, and more. Requires Perl 5. # # Copyright (C) 1996, 1998-2002 by James Marshall, james@jmarshall.com # All rights reserved. # # For the latest, see http://www.jmarshall.com/tools/cgiproxy/ # # # IMPORTANT NOTE ABOUT ANONYMOUS BROWSING: # CGIProxy was originally made for indirect browsing more than # anonymity, but since people are using it for anonymity, I've tried # to make it as anonymous as possible. Suggestions welcome. For best # anonymity, browse with JavaScript turned off. In fact, that's the # only reliable way, in spite of what certain anonymity vendors claim. # Anonymity is pretty good, but may not be bulletproof. For example, # if even a single JavaScript statement can be run, your anonymity can # be compromised. I've tried to remove JS from every place it can # exist, but please tell me if I missed any. Also, browser plugins or # other executable extensions may be able to reveal you to a server. # If you find any way your anonymity can be compromised even with scripts # turned off, please let me know. # # # CONFIGURATION: # # None required in most situations. On some servers, these might be # required (all in the "user configuration" section): # . If you're using another HTTP or SSL proxy, set $HTTP_PROXY, # $SSL_PROXY, and $NO_PROXY as needed. If those proxies use # authentication, set $PROXY_AUTH and $SSL_PROXY_AUTH accordingly. # . If this is running on an SSL server that doesn't use port 443, set # $RUNNING_ON_SSL_SERVER=1 (otherwise, the default of '' is fine). # # Options include: # . 
Set $TEXT_ONLY, $REMOVE_COOKIES, $REMOVE_SCRIPTS, $FILTER_ADS, # $HIDE_REFERER, and $INSERT_ENTRY_FORM as desired. Set # $REMOVE_SCRIPTS if anonymity is important. # . To let the user choose all of those settings (except $TEXT_ONLY), # set $ALLOW_USER_CONFIG=1. # . To change the encoding format of the URL, modify the # proxy_encode() and proxy_decode() routines. The default # routines are suitable for simple PATH_INFO compliance. # . To encode cookies, modify the cookie_encode() and cookie_decode() # routines. # . You can restrict which servers this proxy will access, with # @ALLOWED_SERVERS and @BANNED_SERVERS. # . Similarly, you can specify allowed and denied server lists for # both cookies and scripts. # . For security, you can ban access to private IP ranges, with # @BANNED_NETWORKS. # . If filtering ads, you can customize this with a few settings. # . To insert your own block of HTML into each page, set $INSERT_HTML # or $INSERT_FILE. # . As a last resort, if you really can't run this script as NPH, # you can try to run it as non-NPH by setting $NOT_RUNNING_AS_NPH=1. # BUT, read the notes and warnings above that line. Caveat surfor. # . For crude load-balancing among a set of proxies, set @PROXY_GROUP. # . Other config is possible; see the user configuration section. # . If heavy use of this proxy puts a load on your server, see the # "NOTES ON PERFORMANCE" section below. # # For more info, read the comments regarding any config options you set. # # This script MUST be installed as a non-parsed header (NPH) script. # In Apache and many other servers, this is done by simply starting the # filename with "nph-". It MAY be possible to fake it as a non-NPH # script, MOST of the time, by using the $NOT_RUNNING_AS_NPH feature. # This is not advised. See the comments by that option for warnings. # # # TO USE: # Start a browsing session by calling the script with no parameters. 
# You can bookmark pages you browse to through the proxy, or link to # the URLs that are generated. # # # NOTES ON PERFORMANCE: # Unfortunately, this has gotten slower through the versions, mostly # because of optional new features. Configured equally, version 1.3 # takes 25% longer to run than 1.0 or 1.1 (based on *cough* highly # abbreviated testing). Compiling takes about 50% longer. # Leaving $REMOVE_SCRIPTS=1 adds 25-50% to the running time. # Remember that we're talking about tenths of a second here. Most of # the delay experienced by the user is from waiting on two network # connections. These performance issues only matter if your server # CPU is getting overloaded. Also, these only matter when retrieving # HTML, because it's the HTML modification that takes all the time. # If you can, use mod_perl. Starting with version 1.3.1, this should # work under mod_perl, which requires Perl 5.004 or later. If you use # mod_perl, be careful to install this as an NPH script, i.e. set the # "PerlSendHeader Off" configuration directive. For more info, see the # mod_perl documentation. # If you use mod_perl and modify this script, see the note near the # "reset 'a-z'" line below, regarding UPPER_CASE and lower_case # variables. # # # TO DO: # What I want to hear about: # . Any HTML tags not being converted here. # . Any method of introducing JavaScript or other script, that's not # being filtered out here. # . Any script MIME types other than those already in @SCRIPT_MIME_TYPES. # . Any MIME types other than text/html that have links that need to # be converted. # # plug any other script holes (e.g. MSIE-proprietary, other MIME types?) # This could use cleaner URL-encoding all over ($base_url, etc.) # more error checking? # find a simple encryption technique for proxy_encode() # support more protocols, like mailto: or gopher: # For ad filtering, add option to disable images from servers other than # that of the containing HTML page? Is it worth it? 
# # # BUGS: # Anonymity may not be perfect. In particular, there may be some remaining # JavaScript holes. # URLs generated by JavaScript or similar mechanisms won't be re-proxy'ed # correctly. JavaScript in general may not work as expected. # Since ALL of your cookies are sent to this script (which then chooses # the relevant ones), some cookies could conceivably be dropped if # you accumulate a whole lot. I haven't seen this happen yet. # # # I first wrote this in 1996 as an experiment to allow indirect browsing. # The original seed was a program I wrote for Rich Morin's article # in the June 1996 issue of Unix Review, online at # http://www.cfcl.com/tin/P/199606.shtml. # # Confession: I didn't originally write this with the spec for HTTP # proxies in mind, and there are probably some violations of the protocol # (at least for proxies). This whole thing is one big violation of the # proxy model anyway, so I hereby rationalize that the spec can be widely # interpreted here. If there is demand, I can make it more conformant. # The HTTP client and server components should be fine; it's just the # special requirements for proxies that may not be followed. # #-------------------------------------------------------------------------- use strict ; use Socket ; # First block below is config variables, second block is sort-of config # variables, third block is persistent constants, fourth block is would-be # persistent constants (not set until needed), and last block is variables. 
use vars qw( $TEXT_ONLY $REMOVE_COOKIES $REMOVE_SCRIPTS $FILTER_ADS $HIDE_REFERER $INSERT_ENTRY_FORM $ALLOW_USER_CONFIG @ALLOWED_SERVERS @BANNED_SERVERS @BANNED_NETWORKS $NO_COOKIE_WITH_IMAGE @ALLOWED_COOKIE_SERVERS @BANNED_COOKIE_SERVERS @ALLOWED_SCRIPT_SERVERS @BANNED_SCRIPT_SERVERS @BANNED_IMAGE_URL_PATTERNS $RETURN_EMPTY_GIF $INSERT_HTML $INSERT_FILE $ANONYMIZE_INSERTION $FORM_AFTER_INSERTION $INSERTION_FRAME_HEIGHT $RUNNING_ON_SSL_SERVER $NOT_RUNNING_AS_NPH $HTTP_PROXY $SSL_PROXY $NO_PROXY $PROXY_AUTH $SSL_PROXY_AUTH $MINIMIZE_CACHING $SESSION_COOKIES_ONLY @PROXY_GROUP $USER_AGENT $USE_PASSIVE_FTP_MODE $SHOW_FTP_WELCOME $USE_POST_ON_START $REMOVE_TITLES $NO_BROWSE_THROUGH_SELF $NO_LINK_TO_START $MAX_REQUEST_SIZE $QUIETLY_EXIT_PROXY_SESSION $OVERRIDE_SECURITY $PROXIFY_SCRIPTS $PROXIFY_COMMENTS @SCRIPT_MIME_TYPES @OTHER_TYPES_TO_REGISTER @TYPES_TO_HANDLE $NON_TEXT_EXTENSIONS $PROXY_VERSION @MONTH @WEEKDAY %UN_MONTH @BANNED_NETWORK_ADDRS $RUNNING_ON_IIS @NO_PROXY $NO_CACHE_HEADERS @ALL_TYPES %MIME_TYPE_ID $SCRIPT_TYPE_REGEX $TYPES_TO_HANDLE_REGEX $THIS_HOST $ENV_SERVER_PORT $ENV_SCRIPT_NAME $THIS_SCRIPT_URL $HAS_BEGUN $CUSTOM_INSERTION $HTTP_VERSION $HTTP_1_X $URL $now $packed_flags $encoded_URL $doing_insert_here $env_accept $e_remove_cookies $e_remove_scripts $e_filter_ads $e_insert_entry_form $e_hide_referer $images_are_banned_here $scripts_are_banned_here $cookies_are_banned_here $scheme $authority $path $host $port $username $password $cookie_to_server %auth $script_url $url_start $url_start_inframe $url_start_noframe $is_in_frame $expected_type $base_url $base_scheme $base_host $base_path $base_unframes $default_style_type $default_script_type $status $headers $body $is_html $response_sent $debug ) ; # Under mod_perl, persistent constants only need to be initialized once, so # use this one-time block to do so. 
unless ($HAS_BEGUN) { #-------------------------------------------------------------------------- # user configuration #-------------------------------------------------------------------------- # If set, then proxy traffic will be restricted to text data only, to save # bandwidth (though it can still be circumvented with uuencode, etc.). $TEXT_ONLY= 0 ; # set to 1 to allow only text data, 0 to allow all # If set, then prevent all cookies from passing through the proxy. To allow # cookies from some servers, set this to 0 and see @ALLOWED_COOKIE_SERVERS # and @BANNED_COOKIE_SERVERS below. You can also prevent cookies with # images by setting $NO_COOKIE_WITH_IMAGE below. # Note that this only affects cookies from the target server. The proxy # script sends its own cookies for other reasons too, like to support # authentication. This flag does not stop these cookies from being sent. $REMOVE_COOKIES= 0 ; # If set, then remove as much scripting as possible. If anonymity is # important, this is strongly recommended! Better yet, turn off script # support in your browser. # On the HTTP level: # . prevent transmission of script MIME types (which only works if the server # marks them as such, so a malicious server could get around this, but # then the browser probably wouldn't execute the script). # . remove Link: headers that link to a resource of a script MIME type. # Within HTML resources: # . remove <script>...</script> . # . remove intrinsic event attributes from tags, i.e. attributes whose names # begin with "on". # . remove <style>...</style> where "type" attribute is a script MIME type. # . remove various HTML tags that appear to link to a script MIME type. # . remove script macros (aka Netscape-specific "JavaScript entities"), # i.e. any attributes containing the string "&{" . # . remove "JavaScript conditional comments". # . remove MSIE-specific "dynamic properties". # To allow scripts from some sites but not from others, set this to 0 and # see @ALLOWED_SCRIPT_SERVERS and @BANNED_SCRIPT_SERVERS below. 
# See @SCRIPT_MIME_TYPES below for a list of which MIME types are filtered out. # I do NOT know for certain that this removes all script content! It removes # all that I know of, but I don't have a definitive list of places scripts # can exist. If you do, please send it to me. EVEN RUNNING A SINGLE # JAVASCRIPT STATEMENT CAN COMPROMISE YOUR ANONYMITY! Just so you know. # Richard Smith has a good test site for anonymizing proxies, at # http://users.rcn.com/rms2000/anon/test.htm # Note that turning this on removes most popup ads! :) $REMOVE_SCRIPTS= 1 ; # If set, then filter out images that match one of @BANNED_IMAGE_URL_PATTERNS, # below. Also removes cookies attached to images, as if $NO_COOKIE_WITH_IMAGE # is set. # To remove most popup advertisements, also set $REMOVE_SCRIPTS=1 above. $FILTER_ADS= 0 ; # If set, then don't send a Referer: [sic] header with each request # (i.e. something that tells the server which page you're coming from # that linked to it). This is a minor privacy issue, but a few sites # won't send you pages or images if the Referer: is not what they're # expecting. If a page is loading without images or a link seems to be # refused, then try turning this off, and a correct Referer: header will # be sent. # This is only a problem in a VERY small percentage of sites, so few that # I'm kinda hesitant to put this in the entry form. Other arrangements # have their own problems, though. $HIDE_REFERER= 1 ; # If set, insert a compact version of the URL entry form at the top of each # page. This will also display the URL currently being viewed. # When viewing a page with frames, then a new top frame is created and the # insertion goes there. # If you want to customize the appearance of the form, modify the routine # mini_start_form() near the end of the script. # If you want to insert something other than this form, see $INSERT_HTML and # $INSERT_FILE below. 
# Users should realize that options changed via the form only take effect when # the form is submitted by entering a new URL or pressing the "Go" button. # Selecting an option, then following a link on the page, will not cause # the option to take effect. # Users should also realize that anything inserted into a page may throw # off any precise layout. The insertion will also be subject to # background colors and images, and any other page-wide settings. $INSERT_ENTRY_FORM= 1 ; # If set, then allow the user to control $REMOVE_COOKIES, $REMOVE_SCRIPTS, # $FILTER_ADS, $HIDE_REFERER, and $INSERT_ENTRY_FORM. Note that they # can't fine-tune any related options, such as the various @ALLOWED... and # @BANNED... lists. $ALLOW_USER_CONFIG= 1 ; # Create your own proxy_encode() and proxy_decode() to transform the target # URL to and from the format that will be stored in PATH_INFO. The encoded # form should only contain characters that are legal in PATH_INFO. This # varies by server, but using only printable chars, no "?" or "#", and no # two adjacent slashes ("//") works on most servers. Don't let PATH_INFO # contain the strings "./", "/.", "../", or "/..", or else it may get # compressed like a pathname somewhere. Try not to make the resulting # string too long, either. # Of course, proxy_decode() must exactly undo whatever proxy_encode() does. # Make proxy_encode() as fast as possible-- it's a major bottleneck for the # whole program. # Because of the simplified absolute URL resolution in full_url(), there may # be ".." segments in the default encoding here, notably in the first path # segment. Normally, that's just an HTML mistake, but please tell me if # you see any privacy exploit with it. # Note that a few sites have embedded applications (like applets or Shockwave) # that expect to access URLs relative to the page's URL. This means they # may not work if the encoded target URL can't be treated like a base URL, # e.g. 
#   that it can't be appended with something like "../data/foo.data"
#   to get that expected data file.  In such cases, the default encoding below
#   should let these sites work fine, as should any other encoding that can
#   support URLs relative to it.

# proxy_encode($URL): turn an absolute target URL into the form stored in
#   PATH_INFO and return it.  The default only replaces the "://" that follows
#   a leading scheme with a single "/"; a string that doesn't start with
#   "scheme://" comes back unchanged.
# The commented-out statements are optional extra transformations.  Whatever
#   is enabled here must be undone, in the opposite order, by proxy_decode().
sub proxy_encode {
    my $url = $_[0];

    $url =~ s#^([\w+.-]+)://#$1/#;                     # http://xxx -> http/xxx
#    $url =~ s/(.)/ sprintf('%02x',ord($1)) /ge;       # each char -> 2-hex
#    $url =~ tr/a-zA-Z/n-za-mN-ZA-M/;                  # rot-13

    return $url;
}

# proxy_decode($enc_URL): inverse of proxy_encode().  The default turns a
#   leading "scheme/" back into "scheme://" and returns the result.
# The commented-out statements mirror the optional ones in proxy_encode().
sub proxy_decode {
    my $encoded = $_[0];

#    $encoded =~ tr/a-zA-Z/n-za-mN-ZA-M/;                        # rot-13
#    $encoded =~ s/([0-9A-Fa-f]{2})/ sprintf("%c",hex($1)) /ge;
    $encoded =~ s#^([\w+.-]+)/#$1://#;                 # http/xxx -> http://xxx

    return $encoded;
}


# Encode cookies before they're sent back to the user.
# The return value must only contain characters that are legal in cookie
#   names and values, i.e. only printable characters, and no ";", ",", "=",
#   or white space.
# cookie_encode() is called twice for each cookie: once to encode the cookie
#   name, and once to encode the cookie value.  The two are then joined with
#   "=" and sent to the user.
# cookie_decode() must exactly undo whatever cookie_encode() does.
# Also, cookie_encode() must always encode a given input string into the
#   same output string.  This is because browsers need the cookie name to
#   identify and manage a cookie, so the name must be consistent.
# This is not a bottleneck like proxy_encode() is, so speed is not critical.

# cookie_encode($cookie): return a copy of the string in which every
#   character matched by \W is replaced with "%" followed by its ordinal
#   as lowercase hex, zero-padded to two digits.  Characters matched by \w
#   are passed through untouched.
sub cookie_encode {
    my $plain = $_[0];

#    $plain =~ s/(.)/ sprintf('%02x',ord($1)) /ge;     # each char -> 2-hex
#    $plain =~ tr/a-zA-Z/n-za-mN-ZA-M/;                # rot-13
    $plain =~ s/(\W)/ '%' . sprintf('%02x', ord($1)) /ge;    # simple URL-encoding

    return $plain;
}

# cookie_decode($enc_cookie): return a copy of the string in which each "%"
#   followed by exactly two hex digits (either case) is replaced with the
#   character having that ordinal.  A "%" not followed by two hex digits is
#   left as it is.
sub cookie_decode {
    my $encoded = $_[0];

    $encoded =~ s/%([\da-fA-F]{2})/ chr(hex($1)) /ge;  # URL-decode
#    $encoded =~ tr/a-zA-Z/n-za-mN-ZA-M/;                        # rot-13
#    $encoded =~ s/([0-9A-Fa-f]{2})/ sprintf("%c",hex($1)) /ge;

    return $encoded;
}


# Use @ALLOWED_SERVERS and @BANNED_SERVERS to restrict which servers a user
#   can visit through this proxy.  Any URL at a host matching a pattern in
#   @BANNED_SERVERS will be forbidden.  In addition, if @ALLOWED_SERVERS is
#   not empty, then access is allowed *only* to servers that match a pattern
#   in it.  In other words, @BANNED_SERVERS means "ban these servers", and
#   @ALLOWED_SERVERS (if not empty) means "allow only these servers".  If a
#   server matches both lists, it is banned.
# These are each a list of Perl 5 regular expressions (aka patterns or
#   regexes), not literal host names.  To turn a hostname into a pattern,
#   replace every "." with "\.", add "^" to the beginning, and add "$" to the
#   end.  For example, "www.example.com" becomes "^www\.example\.com$".  To
#   match *every* host ending in something, leave out the "^".  For example,
#   "\.example\.com$" matches every host ending in ".example.com".  For more
#   details about Perl regular expressions, see the Perl documentation.  (They
#   may seem cryptic at first, but they're very powerful once you know how to
#   use them.)

@ALLOWED_SERVERS = ();
@BANNED_SERVERS  = ();


# If @BANNED_NETWORKS is set, then forbid access to these hosts or networks.
# This is done by IP address, not name, so it provides more certain security
#   than @BANNED_SERVERS above.
# Specify each element as a decimal IP address-- all four integers for a host,
#   or one to three integers for a network.  For example, '127.0.0.1' bans
#   access to the local host, and '192.168' bans access to all IP addresses
#   in the 192.168 network.  Sorry, no banning yet for subnets other than
#   8, 16, or 24 bits.
# IF YOU'RE RUNNING THIS ON OR INSIDE A FIREWALL, THIS SETTING IS STRONGLY # RECOMMENDED!! In particular, you should ban access to other machines # inside the firewall that the firewall machine itself may have access to. # Otherwise, external users will be able to access any internal hosts that # the firewall can access. Even if that's what you intend, you should ban # access to any hosts that you don't explicitly want to expose to outside # users. # In addition to the recommended defaults below, add all IP addresses of your # server machine if you want to protect it like this. # After you set this, YOU SHOULD TEST to verify that the proxy can't access # the IP addresses you're banning! # This feature is simple now but will be more complete in future releases. # How would you like this to be extended? What would be useful to you? @BANNED_NETWORKS= ('127.0.0.1', '192.168', '10') ; # Settings to fine-tune cookie filtering, if cookies are not banned altogether # (by user checkbox or $REMOVE_COOKIES above). # Use @ALLOWED_COOKIE_SERVERS and @BANNED_COOKIE_SERVERS to restrict which # servers can send cookies through this proxy. They work like # @ALLOWED_SERVERS and @BANNED_SERVERS above, both in how their precedence # works, and that they're lists of Perl 5 regular expressions. See the # comments there for details. # If non-empty, only allow cookies from servers matching one of these patterns. # Comment this out to allow all cookies (subject to @BANNED_COOKIE_SERVERS). #@ALLOWED_COOKIE_SERVERS= ('\bslashdot\.org$') ; # Reject cookies from servers matching these patterns. @BANNED_COOKIE_SERVERS= ( '\.doubleclick\.net$', '\.preferences\.com$', '\.imgis\.com$', '\.adforce\.com$', '\.focalink\.com$', '\.flycast\.com$', '\.go\.com$', '\.avenuea\.com$', '\.linkexchange\.com$', '\.pathfinder\.com$', '\.burstnet\.com$', '\btripod\.com$', '\bgeocities\.yahoo\.com$', '\.mediaplex\.com$', ) ; # Set this to reject cookies returned with images. 
This actually prevents # cookies returned with any non-text resource. $NO_COOKIE_WITH_IMAGE= 1 ; # Settings to fine-tune script filtering, if scripts are not banned altogether # (by user checkbox or $REMOVE_SCRIPTS above). # Use @ALLOWED_SCRIPT_SERVERS and @BANNED_SCRIPT_SERVERS to restrict which # servers you'll allow scripts from. They work like @ALLOWED_SERVERS and # @BANNED_SERVERS above, both in how their precedence works, and that # they're lists of Perl 5 regular expressions. See the comments there for # details. @ALLOWED_SCRIPT_SERVERS= () ; @BANNED_SCRIPT_SERVERS= () ; # Various options to help filter ads and stop cookie-based privacy invasion. # These are only effective if $FILTER_ADS is set above. # @BANNED_IMAGE_URL_PATTERNS uses Perl patterns. If an image's URL # matches one of the patterns, it will not be downloaded (typically for # ad-filtering). For more information on Perl regular expressions, see # the Perl documentation. # Note that most popup ads will be removed if scripts are removed (see # $REMOVE_SCRIPTS above). # If ad-filtering is your primary motive, consider using one of the many # proxies that specialize in that. The classic is from JunkBusters, at # http://www.junkbusters.com . # Reject images whose URL matches any of these patterns. This is just a # sample list; add more depending on which sites you visit. 
@BANNED_IMAGE_URL_PATTERNS= ( 'ad\.doubleclick\.net/ad/', '\b[a-z](\d+)?\.doubleclick\.net(:\d*)?/', '\.imgis\.com\b', '\.adforce\.com\b', '\.avenuea\.com\b', '\.go\.com(:\d*)?/ad/', '\.eimg\.com\b', '\bexcite\.netscape\.com(:\d*)?/.*/promo/', '/excitenetscapepromos/', '\.yimg\.com(:\d*)?.*/promo/', '\bus\.yimg\.com/[a-z]/(\w\w)/\1', '\bus\.yimg\.com/[a-z]/\d-/', '\bpromotions\.yahoo\.com(:\d*)?/promotions/', '\bcnn\.com(:\d*)?/ads/', 'ads\.msn\.com\b', '\blinkexchange\.com\b', '\badknowledge\.com\b', '/SmartBanner/', '\bdeja\.com/ads/', '\bimage\.pathfinder\.com/sponsors', 'ads\.tripod\.com', 'ar\.atwola\.com/image/', '\brealcities\.com/ads/', '\bnytimes\.com/ad[sx]/', '\busatoday\.com/sponsors/', '\busatoday\.com/RealMedia/ads/', '\bmsads\.net/ads/', '\batdmt\.com/[a-z]/', ) ; # If set, replace banned images with 1x1 transparent GIF. $RETURN_EMPTY_GIF= 1 ; # If either $INSERT_HTML or $INSERT_FILE is set, then that HTML text or the # contents of that named file (respectively) will be inserted into any HTML # page retrieved through this proxy. $INSERT_HTML takes precedence over # $INSERT_FILE. # When viewing a page with frames, a new top frame is created and the # insertions go there. # NOTE: Any HTML you insert should not have relative URLs in it! The problem # is that there is no appropriate base URL to resolve them with. So only use # absolute URLs in your insertion. (If you use relative URLs anyway, then # a) if $ANONYMIZE_INSERTION is set, they'll be resolved relative to this # script's URL, which isn't great, or b) if $ANONYMIZE_INSERTION==0, # they'll be unchanged and the browser will simply resolve them relative # to the current page, which is usually worse.) # The frame handling means that it's fairly easy for a surfer to bypass this # insertion, by pretending in effect to be in a frame. There's not much we # can do about that, since a page is retrieved the same way regardless of # whether it's in a frame. 
This script uses a parameter in the URL to # communicate to itself between calls, but the user can merely change that # URL to make the script think it's retrieving a page for a frame. Also, # many browsers let the user expand a frame's contents into a full window. # [The warning in earlier versions about setting $INSERT_HTML to '' when using # mod_perl and $INSERT_FILE no longer applies. It's all handled elsewhere.] # As with $INSERT_ENTRY_FORM, note that any insertion may throw off any # precise layout, and the insertion is subject to background colors and # other page-wide settings. #$INSERT_HTML= "<h1>This is an inserted header</h1><hr>" ; #$INSERT_FILE= 'insert_file_name' ; # If your insertion has links that you want anonymized along with the rest # of the downloaded HTML, then set this to 1. Otherwise leave it at 0. $ANONYMIZE_INSERTION= 0 ; # If there's both a URL entry form and an insertion via $INSERT_HTML or # $INSERT_FILE on the same page, the entry form normally goes at the top. # Set this to put it after the other insertion. $FORM_AFTER_INSERTION= 0 ; # If the insertion is put in a top frame, then this is how many pixels high # the frame is. If the default of 80 or 50 pixels is too big or too small # for your insertion, change this. You can use percentage of screen height # if you prefer, e.g. "20%". (Unfortunately, you can't just tell the # browser to "make it as high as it needs to be", but at least the frame # will be resizable by the user.) # This affects insertions by $INSERT_ENTRY_FORM, $INSERT_HTML, and $INSERT_FILE. # The default here usually works for the inserted entry form, which varies in # size depending on $ALLOW_USER_CONFIG. It also varies by browser. $INSERTION_FRAME_HEIGHT= $ALLOW_USER_CONFIG ? 80 : 50 ; # Set this to 1 if the script is running on an SSL server, i.e. it is # accessed through a URL starting with "https:"; set this to 0 if it's not # running on an SSL server. This is needed to know how to route URLs back # through the proxy. Regrettably, standard CGI does not yet provide a way # for scripts to determine this without help. # If this variable is set to '' or left undefined, then the program will # guess: SSL is assumed if and only if SERVER_PORT is 443. This fails # if SSL is used on another port, or (less commonly) a non-SSL server uses # port 443, but usually it works. Besides being a good default, it lets # you install the script where both a secure server and a non-secure server # will serve it, and it will work correctly through either server. # This has nothing to do with retrieving pages that are on SSL servers. 
$RUNNING_ON_SSL_SERVER= '' ; # If your server doesn't support NPH scripts, then set this variable to true # and try running the script as a normal non-NPH script. HOWEVER, this # won't work as well as running it as NPH; there may be bugs, maybe some # privacy holes, and results may not be consistent. It's a hack. # Try to install the script as NPH before you use this option, because # this may not work. NPH is supported on almost all servers, and it's # usually very easy to install a script as NPH (on Apache, for example, # you just need to name the script something starting with "nph-"). # One example of a problem is that Location: headers may get messed up, # because they mean different things in an NPH and a non-NPH script. # You have been warned. # For this to work, your server MUST support the "Status:" CGI response # header. $NOT_RUNNING_AS_NPH= 0 ; # Set HTTP and SSL proxies if needed. Also see $USE_PASSIVE_FTP_MODE below. # The format of the first two variables is "host:port", with the port being # optional. The format of $NO_PROXY is a comma-separated list of hostnames # or domains: any request for a hostname that ends in one of the strings in # $NO_PROXY will not use the HTTP or SSL proxy; e.g. use ".mycompany.com" to # avoid using the proxies to access any host in the mycompany.com domain. # The environment variables in the examples below are appropriate defaults, # if they are available. Note that earlier versions of this script used # the environment variables directly, instead of the $HTTP_PROXY and # $NO_PROXY variables we use now. # Sometimes you can use the same proxy (like Squid) for both SSL and normal # HTTP, in which case $HTTP_PROXY and $SSL_PROXY will be the same. # $NO_PROXY applies to both SSL and normal HTTP proxying, which is usually # appropriate. If there's demand to differentiate those, it wouldn't be # hard to make a separate $SSL_NO_PROXY option. 
#$HTTP_PROXY= $ENV{'http_proxy'} ; #$SSL_PROXY= 'firewall.example.com:3128' ; #$NO_PROXY= $ENV{'no_proxy'} ; # If your HTTP and SSL proxies require authentication, this script supports # that in a limited way: you can have a single username/password pair per # proxy to authenticate with, regardless of realm. In other words, multiple # realms aren't supported for proxy authentication (though they are for # normal server authentication, elsewhere). # Set $PROXY_AUTH and $SSL_PROXY_AUTH either in the form of "username:password", # or to the actual base64 string that gets sent in the Proxy-Authorization: # header. Often the two variables will be the same, when the same proxy is # used for both SSL and normal HTTP. #$PROXY_AUTH= 'Aladdin:open sesame' ; #$SSL_PROXY_AUTH= $PROXY_AUTH ; # Here's an experimental feature that may or may not be useful. It's trivial # to add, so I added it. It was inspired in part by Mike Reiter's and Avi # Rubin's "Crowds", at http://www.research.att.com/projects/crowds/ . # Let me know if you find a use for it. # The idea is that you have a number of mutually-trusting, cooperating # proxies that you list in @PROXY_GROUP(). If that is set, then instead # of rerouting all URLs back through this proxy, the script will choose # one of these proxies at random to reroute all URLs through, for each # run. This could be used to balance the load among several proxies, for # example. Under certain conditions it could conceivably help privacy by # making it harder to track a user's session, but under certain other # conditions it could make it easier, depending on how many people, # proxies, and proxy servers are involved. For each page, both its # included images and followed links will go through the same proxy, so a # clever target server could determine which proxy servers are in each # group. # proxy_encode() and proxy_decode() must be the same for all proxies in the # group. 
Same goes for pack_flags() and unpack_flags() if you modified them, # and probably certain other routines and configuration options. # Cookies and Basic authentication can't be supported with this, sorry, since # cookies can only be sent back to the proxy that created them. # Set this to a list of absolute URLs of proxies, ending with "nph-proxy.cgi" # (or whatever you named the script). Be sure to include the URL of this # proxy, or it will never redirect back through here. Each proxy in the # group should have the same @PROXY_GROUP. # Alternately, you could set each proxy's @PROXY_GROUP differently for more # creative configuration, such as to balance the load unevenly, or to send # users through a "round-robin" cycle of proxies. #@PROXY_GROUP= ('http://www.example.com/~grommit/proxy/nph-proxy.cgi', # 'http://www.fnord.mil/langley/bavaria/atlantis/nph-proxy.cgi', # 'http://www.nothinghere.gov/No/Such/Agency/nph-proxy.cgi', # ) ; # Normally, your browser stores all pages you download in your computer's # hard drive and memory, in the "cache". This saves a lot of time and # bandwidth the next time you view the page (especially with images, which # are bigger and may be shared among several pages). However, in some # situations you may not want the pages you've visited to be stored. If # $MINIMIZE_CACHING is set, then this proxy will try its best to prevent any # caching of anything retrieved through it. # NOTE: This cannot guarantee that no caching will happen. All we can do is # instruct the browser not to cache anything. A faulty or malicious browser # could cache things anyway if it chose to. # NOTE: This has nothing to do with your browser's "history list", which may # also store a list of URLs you've visited. # NOTE: If you use this, you will use a lot more bandwidth than without it, # and pages will seemingly load slower, because if a browser can't cache # anything locally then it has to load everything across the network every # time it needs something. 
$MINIMIZE_CACHING= 0 ; # Normally, each cookie includes an expiration time/date, and the cookie stays # in effect until then, even after you exit your browser and restart it # (which normally means the cookie is stored on the hard drive). Any cookie # that has no explicit expiration date is a "session cookie", and stays in # effect only as long as the browser is running, and presumably is forgotten # after that. If you set $SESSION_COOKIES_ONLY=1, then *all* cookies that # pass through this proxy will be changed to session cookies. This is useful # at a public terminal, or wherever you don't want your cookies to remain # after you exit the browser. # NOTE: The clock on the server where this runs must be correct for this # option to work right! It doesn't have to be exact, but don't have it off # by hours or anything like that. The problem is that we must not alter any # cookies set to expire in the past, because that's how sites delete cookies. # If a cookie is being deleted, we DON'T want to turn it into a session # cookie. So this script will not alter any cookies set to expire before the # current time according to the system clock. $SESSION_COOKIES_ONLY= 0 ; # Set $USER_AGENT to something generic like this if you want to be extra # careful. Conceivably, revealing which browser you're using may be a # slight privacy or security risk. # However, note that some URLs serve different pages depending on which # browser you're using, so some pages will change if you set this. # This defaults to the user's HTTP_USER_AGENT. #$USER_AGENT= 'Mozilla/4.05 [en] (X11; I; Linux 2.0.34 i586)' ; # FTP transfers can happen in either passive or non-passive mode. Passive # mode works better if the client (this script) is behind a firewall. Some # people consider passive mode to be more secure, too. But in certain # network configurations, if this script has trouble connecting to FTP # servers, you can turn this off to try non-passive mode. 
# See http://cr.yp.to/ftp/security.html for a discussion of security issues # regarding passive and non-passive FTP. $USE_PASSIVE_FTP_MODE= 1 ; # Unlike a normal browser which can keep an FTP session open between requests, # this script must make a new connection with each request. Thus, the # FTP welcome message (e.g. the README file) will be received every time; # there's no way for this script to know if you've been here before. Set # $SHOW_FTP_WELCOME to true to always show the welcome message, or false # to never show it. $SHOW_FTP_WELCOME= 1 ; # Apparently, some censoring filters search outgoing request URIs, but not # POST request bodies. Set this to make the initial input form submit # using POST instead of GET. $USE_POST_ON_START= 1 ; # Apparently, some censoring filters look at titles on HTML pages. Set this # to remove HTML page titles. $REMOVE_TITLES= 0 ; # If set, this option prevents a user from calling the proxy through the # proxy itself, i.e. looping. It's normally a mistake on the user's part, # and a waste of resources. # This isn't foolproof; it just catches the obvious mistakes. It's probably # pretty easy for a malicious user to make the script call itself, or s/he # can always use two proxies to call each other in a loop. This doesn't # account for IP addresses or multiple hostnames for the same server. $NO_BROWSE_THROUGH_SELF= 0 ; # Set this to leave out the "Restart" link at the bottom of error pages, etc. # In some situations this could make it harder for search engines to find the # start page. $NO_LINK_TO_START= 0 ; # For the obscure case when a POST must be repeated because of user # authentication, this is the max size of the request body that this # script will store locally. If CONTENT_LENGTH is bigger than this, # the body's not saved at all-- the first POST will be correct, but # the second will not happen at all (since a partial POST is worse than # nothing). 
$MAX_REQUEST_SIZE= 4194304 ; # that's 4 Meg to you and me # Normally, if a user tries to access a banned server or use an unsupported # scheme (protocol), this script will alert the user with a warning page, and # either allow the user to click through to the URL unprotected (i.e. without # using the proxy), or ban access altogether. However, in some VPN-like # installations, it may be more desirable to let users follow links from # protected pages (e.g. within an intranet) that lead to unprotected, # unproxified pages (e.g. pages outside of the intranet), with no breaks in # the browsing experience. (This example assumes the proxy owner intends it # to be used for browsing only the intranet and not the Internet at large.) # Set $QUIETLY_EXIT_PROXY_SESSION to skip any warning message and let the # user surf directly to unproxified pages from proxified pages. Note that # this somewhat changes the meaning of @ALLOWED_SERVERS and @BANNED_SERVERS-- # they're not allowed or banned per se, it's just whether this proxy is # willing to handle their traffic. @BANNED_NETWORKS is unaffected, however, # since the IP ranges it contains often make no sense outside of the LAN. # WARNING: DO *NOT* SET THIS FLAG IF ANONYMITY IS IMPORTANT AT ALL!!! IT IS # NOT MEANT FOR THAT KIND OF INSTALLATION. IF THIS IS SET, THEN USERS WILL # SURF INTO UNPROXIFIED, UNANONYMIZED PAGES WITH NO WARNING, AND THEIR # PRIVACY WILL BE COMPROMISED; THEY MAY NOT EVEN NOTICE FOR A LONG TIME. # THIS IS EXACTLY WHAT ANONYMIZING PROXIES ARE CREATED TO AVOID. $QUIETLY_EXIT_PROXY_SESSION= 0 ; # WARNING: # EXCEPT UNDER RARE CIRCUMSTANCES, ANY PROXY WHICH HANDLES SSL REQUESTS # SHOULD *ONLY* RUN ON AN SSL SERVER!!! OTHERWISE, YOU'RE RETRIEVING # PROTECTED PAGES BUT SENDING THEM BACK TO THE USER UNPROTECTED. THIS # COULD EXPOSE ANY INFORMATION IN THOSE PAGES, OR ANY INFORMATION THE # USER SUBMITS TO A SECURE SERVER. THIS COULD HAVE SERIOUS CONSEQUENCES, # EVEN LEGAL CONSEQUENCES.
IT UNDERMINES THE WHOLE PURPOSE OF SECURE # SERVERS. # THE *ONLY* EXCEPTION IS WHEN YOU HAVE *COMPLETE* TRUST OF THE LINK # BETWEEN THE BROWSER AND THE SERVER THAT RUNS THE SSL-HANDLING PROXY, # SUCH AS ON A CLOSED LAN, OR IF THE PROXY RUNS ON THE SAME MACHINE AS # THE BROWSER. # IF YOU ARE ABSOLUTELY SURE THAT YOU TRUST THE USER-TO-PROXY LINK, YOU # CAN OVERRIDE THE AUTOMATIC SECURITY MEASURE BY SETTING THE FLAG BELOW. # CONSIDER THE CONSEQUENCES VERY CAREFULLY BEFORE YOU RUN THIS SSL-ACCESSING # PROXY ON AN INSECURE SERVER!!! $OVERRIDE_SECURITY= 0 ; # Stuff below here you probably shouldn't modify unless you're messing with # the code. # The framework is in place to modify script content to pass back through the # proxy, though the actual code that modifies a single script block of a # given type is not done. If you want to, say, modify JavaScript in # certain ways that work for your purpose, then see the routine # proxify_block(). If you set this $PROXIFY_SCRIPTS flag to true, then # proxify_block() will be called for every piece of script that comes # through this proxy. # So, to modify script content like this: a) set this flag to true, and b) go # write some code in proxify_block() that modifies the script content the # way you want. You probably want to use the routine full_url(); go read # what it does. Also see @TYPES_TO_HANDLE and @SCRIPT_MIME_TYPES below. # Don't set this unless you actually do that programming. Without any added # code, it won't do anything but slow down the program-- dealing with the # script-modifying framework takes longer than merely removing scripts, and # both take a lot longer than leaving scripts intact. # Limited testing shows this adds 20-30% to the running time for script-heavy # sites, and very little for script-free sites. However, this number varies # greatly from page to page.
This is only the overhead involved in # separating out the script content to call proxify_block(); this does not # include anything that is actually done in that routine. # NOTE: This is still experimental. The framework should work fine, but what # goes in proxify_block() is up to you. # NOTE TOO: You will almost certainly not be able to anonymize JavaScript # completely. It's not hard to do "mostly", but it turns out to be a very # complex problem to do completely; there will almost certainly be exploits # that a malicious server can use to get a user's identity. The purpose of # this feature is more to allow scripts to function through the proxy, than # to provide bulletproof anonymity. You may be able to get better anonymity # if you remove certain script statements altogether rather than try to # modify them, and accept that doing so may break a few scripts. # The best advice remains: FOR BEST ANONYMITY, BROWSE WITH SCRIPTS TURNED OFF. $PROXIFY_SCRIPTS= 0 ; # Comments may contain HTML in them, which shouldn't be rendered but may be # relevant in some other way. Set this flag if you want the contents of # comments to be proxified like the rest of the page, i.e. proxify URLs, # stylesheets, scripts, etc. $PROXIFY_COMMENTS= 0 ; # This lists all MIME types that could identify a script, and which will be # filtered out as well as possible if removing scripts: HTTP responses with # Content-Type: set to one of these will be nixed, certain HTML which links # to one of these types will be removed, style sheets with a type here will # be removed, and other odds and ends. # These are used in matching, so can't contain special regex characters. # This list is also used for the experimental $PROXIFY_SCRIPTS function. # This list contains all script MIME types I know of, but I can't guarantee # it's a complete list. It's largely taken from the examples at # http://www.robinlionheart.com/stds/html4/scripts.html # That page describes only the first four below as valid.
# The page at ftp://ftp.isi.edu/in-notes/iana/assignments/media-types/media-types # lists all media (MIME) types registered with the IANA, but unfortunately # many script types (especially proprietary ones) have not registered with # them, and that list doesn't specify which types are script content anyway. @SCRIPT_MIME_TYPES= ('application/x-javascript', 'application/x-ecmascript', 'application/x-vbscript', 'application/x-perlscript', 'application/javascript', 'application/ecmascript', 'text/javascript', 'text/ecmascript', 'text/jscript', 'text/livescript', 'text/vbscript', 'text/vbs', 'text/perlscript', 'text/tcl', 'text/x-scriptlet', 'text/scriptlet', 'application/hta', ) ; # All MIME types in @SCRIPT_MIME_TYPES and @OTHER_TYPES_TO_REGISTER will be # "registered". Registration helps the script remember which MIME type is # expected by a page when downloading embedded URLs, e.g. style sheets. Any # MIME types that need special treatment should be listed here if they're not # already in @SCRIPT_MIME_TYPES. # If you write a handler for a new MIME type in proxify_block(), and that type # isn't already listed in @SCRIPT_MIME_TYPES, then add it here. @OTHER_TYPES_TO_REGISTER= ('text/css') ; # These are MIME types that we *may* try to rewrite in proxify_block(), e.g. # to send all URLs back through this script. If a type isn't on this list, # then we know for certain it should be sent back to the user unchanged, # which saves time. # If you write a handler for a new MIME type in proxify_block(), then add the # type here. # NOT all the types here are actually supported at this time! # text/html is not on this list because currently it's handled specially. @TYPES_TO_HANDLE= ('text/css', 'application/x-javascript', 'application/x-ecmascript', 'application/javascript', 'application/ecmascript', 'text/javascript', 'text/ecmascript', 'text/livescript', 'text/jscript', ) ; # This is a list of all file extensions that will be disallowed if # $TEXT_ONLY is set. 
It's an inexact science. If you want to ban # other file extensions, you can add more to this list. Note that # removing extensions from this list won't necessarily allow those # files through, since there are other ways $TEXT_ONLY is implemented, # such as only allowing MIME types of text/* . # The format of this list is one long string, with the extensions # separated by "|". This is because the string is actually used as # a regular expression. Don't worry if you don't know what that means. # Extensions are roughly taken from Netscape's "Helper Preferences" screen # (but that was in 1996). A more complete list might be made from a # mime.types file. $NON_TEXT_EXTENSIONS= 'gif|jpeg|jpe|jpg|tiff|tif|png|bmp|xbm' # images . '|mp2|mp3|wav|aif|aiff|au|snd' # audios . '|avi|qt|mov|mpeg|mpg|mpe' # videos . '|gz|Z|exe|gtar|tar|zip|sit|hqx|pdf' # applications . '|ram|rm|ra|swf' ; # others # This is now set directly in footer(), the only place it's used. # $PROXY_VERSION= '2.0.1' ; #-------------------------------------------------------------------------- # End of normal user configuration. # Now, set or adjust all globals that remain constant for all runs. #-------------------------------------------------------------------------- # First, set various constants. # These are used in rfc1123_date() and date_is_after(). @MONTH= qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec) ; @WEEKDAY= qw(Sun Mon Tue Wed Thu Fri Sat Sun) ; %UN_MONTH= map { lc($MONTH[$_]), $_ } 0..$#MONTH ; # look up by month name # Next, make copies of any constant environment variables, and fix as needed. # SERVER_PORT and SCRIPT_NAME will be constant, and are used in several places. # Besides, we need SCRIPT_NAME fixed before setting $THIS_SCRIPT_URL. # SCRIPT_NAME should have a leading slash, but the old CGI "standard" from # NCSA was unclear on that, so some servers didn't give it a leading # slash. Here we ensure it has a leading slash. 
$ENV_SERVER_PORT= $ENV{'SERVER_PORT'} ; $ENV_SCRIPT_NAME= $ENV{'SCRIPT_NAME'} ; $ENV_SCRIPT_NAME=~ s#^/?#/# ; # Next, adjust config variables as needed, or create any needed constants from # them. # Create @BANNED_NETWORK_ADDRS from @BANNED_NETWORKS. # No error checking; assumes the proxy owner set @BANNED_NETWORKS correctly. @BANNED_NETWORK_ADDRS= () ; for (@BANNED_NETWORKS) { push(@BANNED_NETWORK_ADDRS, pack('C*', /(\d+)/g)) ; } # If $RUNNING_ON_SSL_SERVER is '', then guess based on SERVER_PORT. $RUNNING_ON_SSL_SERVER= ($ENV_SERVER_PORT==443) if $RUNNING_ON_SSL_SERVER eq '' ; # Set this constant based on whether the server is IIS, because we have to # test it later for every run to work around a bug in IIS. A constant here # saves time when using mod_perl. $RUNNING_ON_IIS= ($ENV{'SERVER_SOFTWARE'}=~ /IIS/) ; # Create @NO_PROXY from $NO_PROXY for efficiency. @NO_PROXY= split(/\s*,\s*/, $NO_PROXY) ; # Base64-encode $PROXY_AUTH and $SSL_PROXY_AUTH if they're not encoded already. $PROXY_AUTH= &base64($PROXY_AUTH) if $PROXY_AUTH=~ /:/ ; $SSL_PROXY_AUTH= &base64($SSL_PROXY_AUTH) if $SSL_PROXY_AUTH=~ /:/ ; # Guarantee URLs in @PROXY_GROUP have no trailing slash. foreach (@PROXY_GROUP) { s#/$## } # Create $NO_CACHE_HEADERS depending on $MINIMIZE_CACHING setting; it is placed # in every response. Note that in all the "here documents" we use for error # messages, it has to go on the same line as another header to avoid a blank # line in the response. $NO_CACHE_HEADERS= $MINIMIZE_CACHING ? "Cache-Control: no-cache\015\012Pragma: no-cache\015\012" : '' ; # Canonicalize all MIME types to lowercase. for (@SCRIPT_MIME_TYPES) { $_= lc } for (@OTHER_TYPES_TO_REGISTER) { $_= lc } # Create @ALL_TYPES and %MIME_TYPE_ID, which are inverses of each other. # This is useful e.g. to identify the MIME type expected in a given download, # in a one-character flag. That's why we limit this to 64 types for now. # $ALL_TYPES[0] is '', so we can test e.g. "if $MIME_TYPE_ID{$id} ..." . 
@ALL_TYPES= ('', @SCRIPT_MIME_TYPES, @OTHER_TYPES_TO_REGISTER) ; &HTMLdie("Too many MIME types to register.") if @ALL_TYPES > 64 ; @MIME_TYPE_ID{@ALL_TYPES}= 0..$#ALL_TYPES ; # Regex that matches a script MIME type. $SCRIPT_TYPE_REGEX= '(' . join("|", @SCRIPT_MIME_TYPES) . ')' ; # Regex that tells us whether we handle a given MIME type. $TYPES_TO_HANDLE_REGEX= '(' . join("|", @TYPES_TO_HANDLE) . ')' ; # Set $THIS_HOST to the best guess how this script was called-- use the # Host: request header if available; otherwise, use SERVER_NAME. # We don't bother with a $THIS_PORT, since it's more reliably set to the port # through which the script was called. SERVER_NAME is much more likely to # be different from the hostname that the user sees, since one server may # handle many domains or have many hostnames. if ($ENV{'HTTP_HOST'} ne '') { ($THIS_HOST)= $ENV{'HTTP_HOST'}=~ m#^(?:[\w+.-]+://)?([^:/?]*)# ; $THIS_HOST= $ENV{'SERVER_NAME'} if $THIS_HOST eq '' ; } else { $THIS_HOST= $ENV{'SERVER_NAME'} ; } # Build the constant $THIS_SCRIPT_URL from environment variables. Only include # SERVER_PORT if it's not 80 (or 443 for SSL). $THIS_SCRIPT_URL= $RUNNING_ON_SSL_SERVER ? 'https://' . $THIS_HOST . ($ENV_SERVER_PORT==443 ? '' : ':' . $ENV_SERVER_PORT) . $ENV_SCRIPT_NAME : 'http://' . $THIS_HOST . ($ENV_SERVER_PORT==80 ? '' : ':' . $ENV_SERVER_PORT) . $ENV_SCRIPT_NAME ; # End of initialization of constants. $HAS_BEGUN= 1 ; } # unless ($HAS_BEGUN) #-------------------------------------------------------------------------- # Global constants are now set. Now do any initialization that is # required for every run. #-------------------------------------------------------------------------- # OK, let's time this thing #$starttime= time ; #my($sutime,$sstime)= (times)[0,1] ; # This is needed to run an NPH script under mod_perl. 
# Other requirements when running under mod_perl:
#   . Perl 5.004 or later must be used, or STDIN and STDOUT won't behave
#       correctly;
#   . exit() cannot be used;
#   . every variable must be initialized or reset on each run;
#   . regexes with the /o option retain state between calls, so be careful;
#   . typeglobbing of *STDIN doesn't work, so filehandles must be passed
#       as strings.
local($|)= 1 ;

# Under mod_perl, global variables survive from one call to the next, so
#   they have to start each run in a known state.  The convention in this
#   program is that (most) UPPER_CASE variables are persistent constants
#   that never change once the $HAS_BEGUN block has set them, and that no
#   lower_case variable is set before this point.  That is a little hacky,
#   and can go wrong if user customizations ignore the convention, but it
#   is fast and simple.
# So if you use mod_perl and change this script: don't modify existing
#   UPPER_CASE variables after the $HAS_BEGUN block, don't set lower_case
#   variables before here, and don't use UPPER_CASE variables for anything
#   that varies from run to run.
reset 'a-z' ;
$URL= '' ;     # (almost) only uppercase variable that varies from run to run

# Remember the current time once, rather than calling time() repeatedly.
$now= time ;    # for (@goodmen)

# Respond with whatever HTTP version the client used.  Only 1.0 and 1.1 are
#   supported so far; anything else is treated as 1.0.
($HTTP_VERSION)= $ENV{'SERVER_PROTOCOL'}=~ m#^HTTP/(\d+\.\d+)#i ;
unless ($HTTP_VERSION=~ /^1\.[01]$/) {
    $HTTP_VERSION= '1.0' ;
}

# Hack to support non-NPH installation-- luckily, a non-NPH response is
#   formatted almost exactly like an NPH response.  The main difference is
#   the first word of the status line: something like "HTTP/1.x 200 OK" can
#   be simulated with "Status: 200 OK", provided the server supports the
#   Status: CGI response header.  So that first word is stored here, as
#   either "HTTP/1.x" or "Status:", and used for all responses throughout
#   the script.
# NOTE: This is not the only difference between an NPH and a non-NPH
#   response.  For example, the Location: header has different semantics
#   between the two types of responses.  This hack is only an approximation
#   that we hope works most of the time.  It's better to install the script
#   as an NPH script if possible (which it almost always is).
# Technically, the HTTP version in the response is supposed to be the highest
#   version supported by the server, even though the rest of the response may
#   be in the format of an earlier version.  Unfortunately, CGI scripts do
#   not have access to that value; it's a hole in the CGI standard.
if ($NOT_RUNNING_AS_NPH) {
    $HTTP_1_X= 'Status:' ;
} else {
    $HTTP_1_X= "HTTP/$HTTP_VERSION" ;
}

# Fix submitted by Alex Freed:  Under some unidentified conditions,
#   instances of nph-proxy.cgi can hang around for many hours and drag the
#   system.  So until we figure out why that is, here's a 10-minute timeout.
#   Please write me with any insight into this, since I can't reproduce the
#   problem.  Under what conditions, on what systems, does it happen?
# 9-9-1999: One theory is that it's a bug in older Apaches, and is fixed by
#   upgrading to Apache 1.3.6 or better.  Julian Haight reports seeing the
#   same problem with other scripts on Apache 1.3.3, and it cleared up when
#   he upgraded to Apache 1.3.6.  Let me know if you can confirm this.
# alarm() is missing on some systems (such as Windows), so it is wrapped in
#   eval{} to avoid failing where alarm() isn't available.
$SIG{'ALRM'}= \&timeexit ;
eval { alarm(600) } ;     # use where it works, ignore where it doesn't

# Exit upon timeout.  If you wish, add code to clean up and log an error.
# Under mod_perl exit() can't be used, so jump to the EXIT label instead.
sub timeexit {
    goto EXIT if $ENV{'MOD_PERL'} ;
    exit 1 ;
}

# Fix any environment variables that the server may have set wrong.
# Note that some constant environment variables are copied to variables above,
#   and fixed there.

# The IIS server doesn't set PATH_INFO correctly-- it sets it to the entire
#   request URI, rather than just the part after the script name.  So fix it
#   here if we're running on IIS.  Thanks to Dave Moscovitz for the info!
# On IIS, PATH_INFO holds the entire request URI, so strip the leading
#   script name from it.  SCRIPT_NAME is quoted with \Q...\E so that any regex
#   metacharacters in it (e.g. the "." in "nph-proxy.cgi", or a "+" or "("
#   in a directory name) are matched literally instead of being interpreted
#   as part of the pattern.
$ENV{'PATH_INFO'} =~ s/^\Q$ENV_SCRIPT_NAME\E//   if $RUNNING_ON_IIS ;

# PATH_INFO may or may not be URL-encoded when we get it; it seems to vary
#   by server.  This script assumes it's still encoded.  Thus, if it's not,
#   we need to re-encode it.
# The only time this seems to come up is when spaces are in URLs, correctly
#   represented in the URL as %20 but decoded to " " in PATH_INFO.  Thus,
#   this hack only focuses on space characters.  It's a hack that I'm not at
#   all comfortable with.  :P
# Very yucky business, this encoding thing.
if ($ENV{'PATH_INFO'}=~ / /) {
    # "%" must be re-encoded first, so the "%20"s added next stay intact.
    $ENV{'PATH_INFO'} =~ s/%/%25/g ;
    $ENV{'PATH_INFO'} =~ s/ /%20/g ;
}

# Copy often-used environment vars into scalars, for efficiency
$env_accept= $ENV{'HTTP_ACCEPT'} || '*/*' ;     # may be modified later

# PATH_INFO consists of a path segment of flags, followed by the encoded
#   target URL.  For example, PATH_INFO might be something like
#   "/010100A/http/www.example.com".  The actual format of the flag segment
#   is defined in the routine pack_flags().
# Thanks to Mike Harding for the idea of using another flag for the
#   $is_in_frame parameter, instead of using two parallel scripts.

# Extract flags and encoded URL from PATH_INFO.
($packed_flags, $encoded_URL)= $ENV{'PATH_INFO'}=~ m#/([^/]*)/?(.*)# ;

# Set all $e_xxx variables ("effective-xxx") and anything else from flag
#   segment of PATH_INFO.  If user config is not allowed or if flag segment
#   is not present, then set $e_xxx variables from hard-coded config variables
#   instead (but still set anything else as needed from PATH_INFO).
if ( $ALLOW_USER_CONFIG && ($packed_flags ne '') ) {
    ($e_remove_cookies, $e_remove_scripts, $e_filter_ads, $e_hide_referer,
     $e_insert_entry_form, $is_in_frame, $expected_type)=
        &unpack_flags($packed_flags) ;

} else {
    # $is_in_frame is set in any case.  It indicates whether the current
    #   request will be placed in a frame.
    # Only elements 5 and 6 of the unpacked flags ($is_in_frame and
    #   $expected_type) are taken from PATH_INFO here; the rest come from
    #   the hard-coded configuration.
    ($e_remove_cookies, $e_remove_scripts, $e_filter_ads, $e_hide_referer,
     $e_insert_entry_form, $is_in_frame, $expected_type)=
        ($REMOVE_COOKIES, $REMOVE_SCRIPTS, $FILTER_ADS, $HIDE_REFERER,
         $INSERT_ENTRY_FORM, (&unpack_flags($packed_flags))[5..6] ) ;
}

# Set any other $e_xxx variables not from flag segment [none currently].

# Flags are now set, and $encoded_URL now contains only the encoded target URL.

# Create a one-flag test for whether we're inserting anything into THIS page.
# This must happen after user flags are read, just above.
$doing_insert_here= !$is_in_frame &&
    ( $e_insert_entry_form || ($INSERT_FILE ne '') || ($INSERT_HTML ne '') ) ;

# One user reported problems with binary files on certain other OS's, and
#   this seemed to fix it.  Supposedly, either this or the "binmode S"
#   statements below the newsocketto() calls work, or all; I'm putting all in.
#   Tell me anything new you figure out about this.
binmode STDOUT ;

#--------------------------------------------------------------------------
#    parse URL, make checks, and set various globals
#--------------------------------------------------------------------------

# Calculate $url_start for use later in &full_url() and elsewhere.  It's an
#   integral part of &full_url(), placed here for speed, similar to the
#   variables set in &fix_base_vars.
# $url_start is the first part of every proxified URL.  A complete proxified
#   URL is made by appending &proxy_encode(URL) (and possibly a #fragment) to
#   $url_start.  $url_start normally consists of the current script's URL
#   (or one from @PROXY_GROUP), plus a flag segment in PATH_INFO, complete
#   with trailing slash.  For example, a complete $url_start might be
#   "http://www.example.com/path/nph-proxy.cgi/010110A/" .
# $url_start_inframe and $url_start_noframe are used to force the frame flag
#   on or off, for example when proxifying a link that causes frames to be
#   entered or exited.  Otherwise, most links inherit the current frame state.
# $script_url is used later for Referer: support, and whenever a temporary
#   copy of $url_start has to be generated.
# In earlier versions of CGIProxy, $url_start was called $this_url, which is
#   really what it was originally.  Its semantics had drifted somewhat since
#   then, so they have been cleaned up, and $url_start is now more descriptive.

# Pick the script URL that all proxified links will point at: a random member
#   of @PROXY_GROUP if that is set, otherwise this script's own URL.
# srand is automatically called in Perl 5.004 and later.  It might be
#   desirable to seed based on the URL, so that multiple requests for
#   the same URL go through the same proxy, and may thus be cached.
#srand( unpack('%32L*', $ENV{'PATH_INFO'}) ) ;  # seed with URL+flags
$script_url= @PROXY_GROUP
    ? $PROXY_GROUP[ int(rand(scalar @PROXY_GROUP)) ]
    : $THIS_SCRIPT_URL ;

# Build the "$script_url/flags/" prefixes: one with the frame flag forced on,
#   one with it forced off.
$url_start_inframe= "$script_url/"
    . &pack_flags($e_remove_cookies, $e_remove_scripts, $e_filter_ads,
                  $e_hide_referer, $e_insert_entry_form, 1, '')
    . '/' ;
$url_start_noframe= "$script_url/"
    . &pack_flags($e_remove_cookies, $e_remove_scripts, $e_filter_ads,
                  $e_hide_referer, $e_insert_entry_form, 0, '')
    . '/' ;

# $url_start itself inherits the frame state of the current request.
if ($is_in_frame) {
    $url_start= $url_start_inframe ;
} else {
    $url_start= $url_start_noframe ;
}

# With no target URL given, there is nothing to fetch yet: start a browsing
#   session instead.
if ($encoded_URL eq '') {
    &show_start_form() ;
}

# Recover the target URL from its encoded form.
$URL= &proxy_decode($encoded_URL) ;

# Set the query string correctly, from either $ENV{QUERY_STRING} or what's
#   already in $URL.
# The query string may exist in either the encoded URL or in the containing
#   URL, as $ENV{QUERY_STRING}.  If the former, then the query string was
#   (definitely?) in a referenced URL, while the latter most likely implies a
#   GET form input.  Either query string is valid, but form input takes
#   precedence-- if $ENV{QUERY_STRING} exists, it should be used over any
#   query string in the encoded URL.
# Note that Netscape does not pass any query string data that is part of the # URL in the
attribute, which is probably correct behaviour. # For this program to act exactly the same, it would need to strip the # query string when updating all URLs, way below. # Question: Is there ever a valid case when both QUERY_STRINGs exist?? $URL=~ s/(\?.*)?$/?$ENV{'QUERY_STRING'}/ if $ENV{'QUERY_STRING'} ne '' ; # Parse the URL, using a regex modelled from the one in RFC 2396 (URI syntax), # appendix B. # This assumes a hierarchical scheme; it won't work for e.g. mailto: # "authority" is the combination of host, port, and possibly other info. # Note that $path here will also contain any query component; it's more like # the request URI. # Note that $URL is guaranteed to be an absolute URL with no "#" fragment, # though this does little error-checking. Note also that the old ";" # parameters are now included in the path component. ($scheme, $authority, $path)= ($URL=~ m#^([\w+.-]+)://([^/?]*)(.*)$#i) ; $scheme= lc($scheme) ; $path= "/$path" if $path!~ m#^/# ; # if path is '' or contains only query # Magic here-- if $URL uses special scheme "x-proxy", immediately call the # general-purpose xproxy() routine. &xproxy($URL) if $scheme eq 'x-proxy' ; # Set $is_html if $path (minus query) ends in .htm or .html . # MSIE has a bug (and privacy hole) whereby URLs with QUERY_STRING ending # in .htm or .html are mistakenly treated as HTML, and thus could have # untranslated links, # or tags. This is most likely what the HTML author expects # anyway, though it violates the HTML spec. In this script, we should # over-proxify rather than under-proxify, so we'll end those blocks on # those end tags as browsers (erroneously) do. # Worse, Konqueror allows the string "" inside JS literal strings, # i.e. doesn't end the script block on them. Netscape does end the block # there, and both browsers end style blocks on embedded strings. 
# Because it's a given that we can't anonymize scripts completely, but # we do want to anonymize HTML completely, we'd rather accidentally # treat script content as HTML than the other way around. So err on # ending the " regardless of whether it's in a string. # (We'd end on " blocks, conditional comments, # intrinsic event attributes ("on___" attributes), script macros, and # the MSIE-specific "dynamic properties". These can be removed or # proxified, depending on the settings of $scripts_are_banned_here and # $PROXIFY_SCRIPTS. # Script content can also exist elsewhere when its MIME type is explicitly # given (for example, in a ') ; # Handle any declarations. # Declarations can contain URLs, such as for DTD's. Most legitimate # declarations would be safe if left unconverted, but if we don't # convert URLs then a malicious document could use this mechanism # to break privacy. Here we use a simple method to handle virtually # all existing cases and close all privacy holes. } elsif ($declaration) { my($inside, @words, $q, $rebuild) ; ($inside)= $declaration=~ /^]*)/ ; @words= $inside=~ /\s*("[^">]*"?|'[^'>]*'?|[^'"][^\s>]*)/g ; # Remember position of first tag, for insertions later. # This should only matter when is first tag, thus only # when @out<2 (don't forget push(@out,$start) above). But verify # this if any other push()'s are added to the code. Hack hack. $doctype_pos= @out+1 if (@out<2) && (lc($words[0]) eq 'doctype') ; # Instead of handling all SGML declarations, the quick hack here is # to convert any "word" in it that looks like an absolute URL. It # handles virtually all existing cases well enough, and closes any # privacy hole regardless of the declaration. foreach (@words) { if (m#^["']?[\w+.-]+://#) { if (/^"/) { $q= '"' ; s/^"|"$//g } elsif (/^"/) { $q= '"' ; s/^"|"$//g } else { $q= '' } $_= $q . &HTMLescape(&full_url(&HTMLunescape($_))) . 
$q ; $rebuild= 1 ; } } $declaration= '' if $rebuild ; push(@out, $declaration) ; } # end of main if comment/script/style/declaration/tag block } # end of main while loop # @out now has proxified HTML # Now, insert form and/or other header as needed, if we're not in a frame. # Insert it right after the tag if available, else right after the # tag, else at the beginning. # Only do this if we're proxifying an entire page, not if we're proxifying # an HTML fragment (as indicated by the $is_full_page flag). if ($is_full_page) { splice(@out, ($body_pos || $html_pos || $doctype_pos), 0, &full_insertion($URL,0)) if $doing_insert_here ; # Putting something (even a comment) before confuses some # browsers (like MSIE 6.0), so any insertion should go after that. # This only matters when the is the first tag, so # $doctype_pos is presumably only set when that's true. # Prepend newline if it's after a . splice(@out, $doctype_pos, 0, ($doctype_pos ? "\n" : ''), "\n") ; } return join('', @out) ; } # sub proxify_html() #---------------------------------------------