#!/usr/bin/env python
"""Parse HTML files and encode email addresses to hide them from spammers.
This program uses numerical HTML entities, either in decimal (default) or
hexadecimal encoding.
Author: Andreas Neudecker
Created: 2003-07-30
Changed: 2003-10-01
Version: 0.0.3
Copyright (c) 2003 by Andreas Neudecker.
Licensed under the GNU General Public License (GPL),
see http://www.gnu.org/copyleft/ for details.
"""
# Changelog:
# (Add change notes chronologically. Include name, email, date)
# [2003-09-28] Added terrific new feature. (E. Xample, e.xample@some-email.net)
# [2003-10-01] Much better regular expression for finding emails, thanks to
# Michael Wurzel; more minor changes. (Andreas, zapyon@gmx.net)
# Comment conventions:
# [???] = Problem to be solved
# [!!!] = Needs testing/editing
# [2do] = Needs to be implemented/changed/optimised
# You should add a useful explanation.
# Execute always --------------------------------------------------------------#
# Use Python JIT Psyco to speed up execution. (http://psyco.sourceforge.net/)
#
# Needs to be imported BEFORE other modules to allow them to be optimised at
# runtime. Makes most sense to import and run Psyco at the start of your
# program's main module!
# Set 'usePsyco' to false while testing. Leave it on for finished code.
# It is NOT necessary to switch off (False) on systems without Psyco installed.
usePsyco = False
# set to true, if you want to use it AND have verbosity.py
#useVerbosity = True
if usePsyco and __name__ == '__main__':
    # Psyco is an optional accelerator: only a failed import means
    # "not installed". Other errors are no longer hidden behind that message.
    try:
        import psyco
    except ImportError:
        # Single-argument print (...) works as a statement on Python 2 and as
        # a function call on Python 3.
        print('Psyco not found, ignoring it')
    else:
        psyco.profile()
# Modules ---------------------------------------------------------------------#
#from optik import *
#import string
import re
import random as R
# Globals ---------------------------------------------------------------------#
# Module-level switch; nothing in the visible code reads it.
# NOTE(review): presumably meant to enable extra output while testing --
# confirm against any other users of this module.
verboseTesting = True
# Classes ---------------------------------------------------------------------#
# Functions -------------------------------------------------------------------#
def obfuscate(string, mode='decimal'):
    """Encode 'string' using numeric HTML entities.

    string: the text to encode.
    mode:   selects the encoding of the entities:
            'decimal', 'dec', 'd':      decimal encoding (default)
            'hexadecimal', 'hex', 'h':  hexadecimal encoding
            'mixed', 'mix', 'm':        mixed hex and dec encoding
            'none', 'n':                do not encode
            Any other value, including None, leaves 'string' unchanged.

    Returns the encoded string (or 'string' itself if nothing is encoded).
    """
    encoders = {
        'decimal': decHtmlEnt,
        'dec': decHtmlEnt,
        'd': decHtmlEnt,
        'hexadecimal': hexHtmlEnt,
        'hex': hexHtmlEnt,
        'h': hexHtmlEnt,
        'mixed': mixHtmlEnt,
        'mix': mixHtmlEnt,
        'm': mixHtmlEnt,
        'none': noEncoding,
        'n': noEncoding,
    }
    try:
        encode = encoders[mode]
    except (KeyError, TypeError):
        # KeyError: unknown mode. TypeError: unhashable mode value.
        # Either way the caller gets the text back untouched.
        return string
    # Every encoder processes a whole string on its own, so one call is
    # enough; there is no need to dispatch character by character.
    return encode(string)
def decHtmlEnt(string):
    """Encode every character of 'string' as a decimal HTML entity.

    Each character becomes '&#' + its decimal code point + ';', for example
    'a' becomes '&#97;'. An empty 'string' yields ''.
    """
    # The '&#' prefix is what makes the result a numeric character reference;
    # without it a browser would show the bare number.
    return ''.join(['&#%d;' % ord(ch) for ch in string])
def hexHtmlEnt(string):
    """Encode every character of 'string' as a hexadecimal HTML entity.

    Each character becomes '&#x' + its lowercase hexadecimal code point + ';',
    for example 'a' becomes '&#x61;'. An empty 'string' yields ''.
    """
    # '&#x' introduces a hexadecimal numeric character reference; the '%x'
    # conversion yields the same lowercase digits as hex () without its '0x'.
    return ''.join(['&#x%x;' % ord(ch) for ch in string])
def mixHtmlEnt(string):
    """Encode 'string' with a per-character random choice of entity style.

    For every character posRandInt (1) is consulted once: a result of 1
    selects decHtmlEnt, a result of 0 selects hexHtmlEnt. The encoded
    characters are concatenated and returned.
    """
    # Index 0 -> hexadecimal, index 1 -> decimal.
    byCoin = (hexHtmlEnt, decHtmlEnt)
    return ''.join([byCoin[posRandInt(1)](ch) for ch in string])
def noEncoding(string):
    """Identity encoder: hand 'string' back exactly as received."""
    return string
def posRandInt(n, count=None):
    """Pick a random non-negative integer from the closed range [0, n].

    n:     inclusive upper bound; a non-negative integer.
    count: if given (0 included), return a list of 'count' such integers.
           If omitted or None, return a single integer.

    Every value in [0, n] is equally likely. Rounding a scaled float, as
    an earlier version did, gave 0 and n only half the chance of the values
    in between.

    Raises ValueError (from random.randint) if n is negative.
    """
    # Compare against None so that an explicit count of 0 yields an empty
    # list instead of being mistaken for "no count given".
    if count is None:
        return R.randint(0, n)
    return [R.randint(0, n) for _ in range(count)]
def encLine(line, mode='dec'):
    """Obfuscate the email-related parts of 'line'.

    line: a string, usually one line of an HTML file.
    mode: how the matched text is encoded as numerical HTML entities;
          it is passed on unchanged to replExpr. Values understood are:
          'decimal', 'dec', 'd':      decimal encoding (default is 'dec')
          'hexadecimal', 'hex', 'h':  hexadecimal encoding
          'mixed', 'mix', 'm':        mixed hex and dec encoding
          For all other values, including None, the text is left as is.

    Two things are matched: runs of address characters (letters, digits,
    '_', '-', '.') around an '@', and the literal 'mailto:'. Both runs may
    be empty, so a lone '@' is matched as well.

    Returns the line with every match replaced.
    """
    addrChars = r"[a-zA-Z_\-.0-9]"
    # re.VERBOSE makes the blanks inside the pattern insignificant.
    source = r"(%s * @ %s * ) | mailto:" % (addrChars, addrChars)
    matcher = re.compile(source, re.VERBOSE)
    return replExpr(matcher, line, mode)
def replExpr(regx, line, mode):
    """Return 'line' with each match of 'regx' replaced by its encoded form.

    regx: a compiled regular expression.
    line: the string to search.
    mode: encoding mode handed to obfuscate for every match.

    The whole matched text (group 0) is what gets passed to obfuscate.
    """
    def encodeMatch(found):
        return obfuscate(found.group(0), mode)

    return regx.sub(encodeMatch, line)
# main () ---------------------------------------------------------------------#
def __main ():
    """Run a small demo: encode a sample text with encLine and print it."""
    sample = '''
a.aa@bbb.ccc, def
(ddd.ddd@eee.ff),
ghi (g@h.i)
'''
    # Single-argument print (...) behaves identically as a Python 2 statement
    # and as a Python 3 function call, so the demo runs on both.
    print(encLine(sample))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    __main ()
# EOF -------------------------------------------------------------------------#