# pretreatment (filtering, fixups), applied in order
# the syntax is meant to be suitable for inclusion into Apache config
# regexps must be in double quotes. Double quotes can be backslash-quoted.
# good reference about Python regexp: http://www.amk.ca/python/howto/regex/regex.html
#
# short intro to things that may be special to Python:
# (?: )         non-capturing group
# (?P<name> )   named group
# (FIXME: need to check if all these are supported in Apache)

#
# This directive applies only to the "offline parsing" script. Apache doesn't see
# the log line before it constructs and writes it at the end of request processing.
# Thus, Apache ignores this directive.
#
# It serves to
# 1) split a line of the log file into the relevant fragments
# 2) ignore log lines that don't match
#
# The expression needs to result in the following seven match groups, in this order:
# (IP, timestamp, url, status, referer, ua, country)
# FIXME: country should be optional, because it occurs only in a MirrorBrain logfile
#
# 123.123.123.123 - - [23/Nov/2009:18:19:14 +0100] "GET /files/stable/3.1.1/OOo_3.1.1_MacOSXIntel_install_en-US.dmg HTTP/1.1" 302 399 "http://download.openoffice.org/all_rc.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 1.1.4322; .NET CLR 3.5.30729; .NET CLR 3.0.30618)" ftp.astral.ro r:country 913 844 EU:RO ASN:9050 P:92.81.0.0/16 size:24661382 -
#
# 200 is returned for files that are not on mirrors, and for metalinks
#
# Only GET requests with status 200 or 302 match; the country is taken from
# the two letters after the colon in the "XX:YY ASN:" field (RO in the example).
StatsLogMask "^(\S+).+\[(.*?)\] \"GET (\S*) HTTP.*\" (200|302) [^\"]+ \"([^\"]*)\" \"([^\"]*)\".* \w\w:(\w\w) ASN:"

# FIXME: we should have a separate directive that determines which status codes
# are considered for logging, which is read both by the script and by
# Apache
#
#StatsLogStatus 200
#StatsLogStatus 302

#
# Before doing anything else, silently ignore all files matching the following
# regular expression. (All others will be considered for counting.)
#
# ignore all requests that come with query string
StatsIgnoreMask ".*\?.*"
#
# ignore files with these endings
StatsIgnoreMask "^.*\.(txt|list|html)$"

# Ignore all requests from this host (string prefix match), and don't process
# the log lines at all.
StatsIgnoreIP 140.211.167.212

#
# Drop recurring identical requests
#
# define the size of a sliding window for remembering the last requests,
# while parsing the log. Keyed by (ip, url, referer, user-agent),
# every request is checked whether it has been seen in identical form before.
StatsDupWindow 200
# FIXME: Apache will have to filter on time instead of number of requests,
# for practical reasons (memcached automates this nicely)

#
# Apply the following series of filters to the request URL, in the order given
#
# strip prefixed protocol (normally only sent to proxies, but can occur in the wild)
# (the filter is applied to the requested url.)
# The scheme and host are replaced by a single "/" so that the URL keeps its
# leading slash; the "^/files/" filter further down relies on it.
StatsPrefilter "^http://[^/]+/" "/"
# remove duplicated slashes
StatsPrefilter "/+" "/"
# remove an optional timestamp (example: _20091121)
StatsPrefilter "_[0-9]{8}" ""
# strip leading base path
StatsPrefilter "^/files/" ""
# strip appended .metalink suffix
StatsPrefilter "\.metalink$" ""
# language codes are sometimes lower case, sometimes upper
# localized/en-GB/3.1.1/OOo_3.1.1_Win32Intel_langpack_en-GB.exe
# localized/zh-cn/3.1.0/OOo_3.1.0_Solarisx86_install_zh-cn.tar.gz
# localized/zh-cn/3.1.1/OOo_3.1.1_LinuxIntel_install_zh-CN.tar.gz
StatsPrefilter "zh-cn" "zh-CN"
StatsPrefilter "zh-tw" "zh-TW"

#
# StatsCount translates the remaining URL into the pieces to be logged
#
# FIXME: This assumes that there need to be 4 pieces. Fewer should work as well.
# Each StatsCount takes a regexp matched against the prefiltered URL and a
# template that builds the "prod: .. os: .. version: .. lang: .." record from
# the match groups.
#
# stable/3.1.1/OOo_3.1.1_Win32Intel_install_en-US.exe
# stable/3.1.1/OOo_3.1.1_MacOSXIntel_install_en-US.dmg
# stable/3.1.1/OOo_3.1.1_Win32Intel_install_wJRE_en-US.exe
# extended/3.1.1rc2/OOo_3.1.1rc2_20090820_Win32Intel_langpack_en-ZA.exe
# extended/3.1.1rc2/OOo_3.1.1rc2_20090820_Win32Intel_langpack_en-ZA.exe
# extended/3.1.1rc2/OOo_3.1.1rc2_20090820_Win32Intel_langpack_en-ZA.exe
# extended/3.1.1rc2/OOo_3.1.1rc2_20090820_LinuxIntel_langpack_brx_deb.tar.gz
# extended/3.1.1rc2/BrOOo_3.1.1rc2_LinuxIntel_install_pt-BR_deb.tar.gz
# extended/developer/DEV300_m65/OOo-Dev-SDK_DEV300_m65_Win32Intel_install_en-US.exe
#
# The language is captured as the named group "lang" and referenced with
# \g<lang> in the template; \1 is the version directory, which must reappear
# in the file name.
StatsCount "^(?:stable|extended)/(?:developer/)?([^/]+)/(OOo|OOo-SDK|OOo-Dev|OOo-Dev-SDK|BrOOo)_\1_(.+)_(?P<lang>([a-zA-Z]{2}(-[a-zA-Z]{2})?|binfilter|core|l10n|extensions|system|testautomation|brx|dgo|kok|mai|mni|sat))(_deb|_rpm)?\.(exe|dmg|sh|tar\.gz|tar\.bz2)$" "prod: \2 os: \3 version: \1 lang: \g<lang>"

# localized/vi/3.1.1/OOo_3.1.1_Win32Intel_langpack_vi.exe
# localized/zh-cn/3.1.0/OOo_3.1.0_Solarisx86_install_zh-cn.tar.gz
# extended/localized/mk/3.0.0/OOo_3.0.0_LinuxIntel_install_mk_deb.tar.gz
StatsCount "^(?:extended/)?localized/([^/]+)/([^/]+)/(OOo)_\2_(.+)_\1(_deb|_rpm)?\.(exe|dmg|sh|tar\.gz|tar\.bz2)$" "prod: \3 os: \4 version: \2 lang: \1"

# Iso images
# extended/iso/en/OO.o_3.1.1_LinuxIntel_en-US.iso
# extended/iso/en/OO.o_3.1.1_MacOSXIntel_en-US.iso
# extended/iso/en/OO.o_3.1.1_Win32Intel_en-US.iso
StatsCount "^extended/iso/([^/]+)/OO\.o_([^_]+)_(.+)_.*\.iso$" "prod: iso os: \3 version: \2 lang: \1"

# extended/iso/de/prooo-box-cd/prooo-box-3.1.1-1_CD_Win_de.iso
StatsCount "^extended/iso/([^/]+)/prooo-box-cd/prooo-box-([^_]+)-1_(.+)_.*\.iso$" "prod: iso os: \3 version: \2 lang: \1"

# this one is quite a special case (I think it is 3.1.0)
# extended/iso/en/ooo31_install_allplatforms_dvd.iso
StatsCount "^extended/iso/([^/]+)/ooo31_(.+)_.*\.iso$" "prod: iso os: \2 version: 3.1.0 lang: \1"

# contrib/dictionaries/thes_es_ES.zip
# contrib/dictionaries/thes_pt_PT_v2.zip
# (the dot before "zip" is escaped so that only a literal ".zip" suffix matches)
StatsCount "^contrib/dictionaries/(thes)_(.*?)(_v2)?\.zip$" "prod: \1 os: all version: all lang: \2"

#
# Filters to be applied after parsing (but still before counting)
#
#StatsPostfilter "foo" "bar"
# remove the "prod: ", "os: ", "version: " and "lang: " labels that the
# StatsCount templates above put in front of each value
StatsPostfilter "(prod|os|version|lang): " ""
# FIXME: we didn't need to add those words in the first place; not needed at all.

# vim: ft=apache ai ts=4 sw=4 smarttab expandtab