D. Gibbon: Soundex in Python
def soundex(name):
#Render as upper case
name = name.upper()
# Separate
first=name[0]
rest=name[1:]
#Remove punctuation
rest=rest.upper()
rest=re.sub("[.,:;-]",'',rest)
#1. Perform numerical substitutions for consonants
# ***(h, w, y missing in Bird NLTK version)***
rest=re.sub('[AEIOUHWY]','',rest)
rest=re.sub("'",'',rest)
rest=re.sub('[BFPV]','1',rest)
rest=re.sub('[CGJKQSXZ]','2',rest)
rest=re.sub('[DT]','3',rest)
rest=re.sub('[L]','4',rest)
rest=re.sub('[MN]','5',rest)
rest=re.sub('[R]','6',rest)
#2. Collapse adjacent identical digits
rest=rest+'_'
newstring=''
for n in range(len(rest)-1):
if rest[n]!=rest[n+1]:
newstring=newstring+rest[n]
rest=newstring
#3. Remove non-digits
rest=re.sub('\D','',rest)
#4. Right-pad with zeroes, keep 1st 3 digits
rest=rest+"000"
rest=rest[:3]
# Restore first letter
soundexresult=first+rest
return soundexresult