#!/usr/bin/env python
"""Parse HTML files and encode email addresses to hide them from spammers.

This program uses numerical HTML entities, either in decimal (default) or
hexadecimal encoding.

Author:  Andreas Neudecker
Created: 2003-07-30
Changed: 2003-10-01
Version: 0.0.3

Copyright (c) 2003 by Andreas Neudecker.
Licensed under the GNU General Public License (GPL), see
http://www.gnu.org/copyleft/ for details.
"""

# Changelog:
# (Add change notes chronologically. Include name, email, date)
# [2003-09-28] Added terrific new feature. (E. Xample, e.xample@some-email.net)
# [2003-10-01] Much better regular expression for finding emails, thanks to
#              Michael Wurzel; more minor changes. (Andreas, zapyon@gmx.net)

# Comment conventions:
# [???] = Problem to be solved
# [!!!] = Needs testing/editing
# [2do] = Needs to be implemented/changed/optimised
#         You should add a useful explanation.

# Execute always --------------------------------------------------------------#

# Use Python JIT Psyco to speed up execution. (http://psyco.sourceforge.net/)
#
# Needs to be imported BEFORE other modules to allow them to be optimised at
# runtime. Makes most sense to import and run Psyco at the start of your
# program's main module!

# Set 'usePsyco' to false while testing. Leave it on for finished code.
# It is NOT necessary to switch off (False) on systems without Psyco installed.
usePsyco = False

# set to true, if you want to use it AND have verbosity.py
#useVerbosity = True

if usePsyco and __name__ == '__main__':
    try:
        import psyco
        psyco.profile()
    except Exception:
        # Psyco is an optional speed-up: any failure to load or start it is
        # reported and ignored.  'Exception' (rather than a bare 'except')
        # lets KeyboardInterrupt and SystemExit propagate.
        print('Psyco not found, ignoring it')


# Modules ---------------------------------------------------------------------#

#from optik import *
#import string
import re
import random as R


# Globals ---------------------------------------------------------------------#

verboseTesting = True

# Characters accepted on either side of the '@' of an email address.
_CHAR_CLASS = r"[a-zA-Z_\-.0-9]"

# Compiled once at import time instead of on every encLine() call.
# re.VERBOSE makes the blanks inside the pattern insignificant, so this
# matches either
#   <zero or more address chars> '@' <zero or more address chars>
# or the literal text 'mailto:'.
_EMAIL_RE = re.compile(
    r"(%s * @ %s * ) | mailto:" % (_CHAR_CLASS, _CHAR_CLASS),
    re.VERBOSE)


# Classes ---------------------------------------------------------------------#


# Functions -------------------------------------------------------------------#

def obfuscate(string, mode='decimal'):
    """Encode string using numeric HTML entities.

    Possible encoding modes for the entities are:
        'decimal', 'dec', 'd':      decimal encoding (default)
        'hexadecimal', 'hex', 'h':  hexadecimal encoding
        'mixed', 'mix', 'm':        mixed hex and dec encoding
        'none', 'n':                do not encode

    Any other value of 'mode' (including None) also returns 'string'
    unchanged.
    """
    modeUse = {
        'decimal': decHtmlEnt,
        'dec': decHtmlEnt,
        'd': decHtmlEnt,
        'hexadecimal': hexHtmlEnt,
        'hex': hexHtmlEnt,
        'h': hexHtmlEnt,
        'mixed': mixHtmlEnt,
        'mix': mixHtmlEnt,
        'm': mixHtmlEnt,
        'none': noEncoding,
        'n': noEncoding,
    }
    try:
        encoder = modeUse[mode]
    except (KeyError, TypeError):
        # Unknown mode (KeyError) or an unhashable mode value (TypeError):
        # fall back to "don't encode".
        return string
    # Every encoder already walks the string character by character, so it
    # is applied to the whole string in a single call.
    return encoder(string)


def decHtmlEnt(string):
    """Encode every character of 'string' as a decimal HTML entity.

    Example: 'a' becomes '&#97;'.
    """
    return ''.join(['&#%d;' % ord(ch) for ch in string])


def hexHtmlEnt(string):
    """Encode every character of 'string' as a hexadecimal HTML entity.

    Example: 'a' becomes '&#x61;'.
    """
    return ''.join(['&#x%x;' % ord(ch) for ch in string])


def mixHtmlEnt(string):
    """Encode 'string' as random mix of hex and decimal encoded HTML entities.

    The encoding is chosen independently for each character.
    """
    return ''.join([
        decHtmlEnt(ch) if posRandInt(1) else hexHtmlEnt(ch)
        for ch in string
    ])


def noEncoding(string):
    """Do not manipulate 'string'."""
    return string


def posRandInt(n, count=None):
    """Pick a random integer in the range of [0, n] (both ends included).

    n:      upper bound; expected to be a non-negative integer
            (random.randint rejects an empty range).
    count:  if given (not None), return a list of 'count' random integers;
            a count of 0 yields an empty list.  Else, return just one
            integer.

    Every value in [0, n] is equally likely.
    """
    if count is None:
        return R.randint(0, n)
    return [R.randint(0, n) for _ in range(count)]


def encLine(line, mode='dec'):
    """Encode email addresses and 'mailto:' prefixes in a line as desired.

    line:   a 'string', usually one line of a HTML file.
    mode:   the matched text is encoded using numerical HTML entities
            ('&#...;').  Allowed values are:
                'decimal', 'dec', 'd':      decimal encoding (default)
                'hexadecimal', 'hex', 'h':  hexadecimal encoding
                'mixed', 'mix', 'm':        mixed hex and dec encoding
            For all other values, including None the original string is
            returned.

    Returns 'line' with every match of the email pattern replaced by its
    encoded form.
    """
    return replExpr(_EMAIL_RE, line, mode)


def replExpr(regx, line, mode):
    """Use regular expression 'regx' to perform replacements in line.

    Every match of the compiled pattern 'regx' in 'line' is replaced by the
    result of obfuscate() applied to the matched text with the given 'mode'.
    Returns the resulting string.
    """
    def repl(mtch):
        return obfuscate(mtch.group(0), mode)

    return regx.sub(repl, line)


# main () ---------------------------------------------------------------------#

def __main():
    """Print a small sample text with its email addresses encoded."""
    a = ''' a.aa@bbb.ccc, def (ddd.ddd@eee.ff), ghi (g@h.i) '''
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print(encLine(a))


if __name__ == '__main__':
    __main()

# EOF -------------------------------------------------------------------------#