Email Parser
by .hac on November 16th, 2009
Email Parser ([any name] AT [any domain name] DOT [any top-level domain])
This is a proof-of-concept (PoC) parser written in Python.
It uses http://www.google.com/search?q=at+gmail+dot+com as its input.
import re, httplib
# Characters that get blanked out before the text is split into words, so
# that forms such as "[at]" or "(dot)" turn into stand-alone tokens.
delimiters = '[]()<>""'

# Current state of the parser:
#   0 - looking for "AT"
#   1 - "AT" seen, looking for "DOT"
#   2 - a complete address has been collected
state = 0

# Pieces of the address that is currently being assembled.
local_part = domain = top_level_domain = ""
# removing all HTML tags
# "[^>]*" (instead of ".*?") lets a tag span several lines.
_TAG_RE = re.compile(r'<[^>]*>')
# Only real character references (&name; &#123; &#x1F;) are matched, so a
# stray "&" in running text no longer swallows everything up to the next ";".
_CHAR_REF_RE = re.compile(r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')


def remove_html_tags(data):
    """Replace HTML tags and character references in *data* with spaces.

    Args:
        data: Text that may contain HTML markup.

    Returns:
        A copy of *data* in which every tag (``<...>``, including tags that
        contain line breaks) and every character reference (``&nbsp;``,
        ``&#39;``, ``&#x27;``) has been replaced by a single space.  All
        other text is returned unchanged.
    """
    without_tags = _TAG_RE.sub(' ', data)
    return _CHAR_REF_RE.sub(' ', without_tags)
# looking for "AT"
def transition_in_state0(i, result):
    """State 0 handler: wait for the word "at" (any letter case).

    When ``result[i]`` is "at" and a token precedes it, that preceding token
    is stored in the global ``local_part`` and the global ``state`` becomes 1.
    Otherwise nothing is changed.

    Args:
        i: Index of the token being examined.
        result: List of all tokens of the document.
    """
    global state, local_part
    # The very first token has no predecessor that could be the local part.
    if i <= 0:
        return
    if result[i].lower() != "at":
        return
    local_part, state = result[i - 1], 1
# discovered "AT" and looking for "DOT"
def transition_in_state1(i, result):
    """State 1 handler: an "at" has been seen, now wait for "dot".

    * Another "at" (with a token before it) restarts the match: the token
      before it becomes the new ``local_part`` and the state stays 1.
    * A "dot" completes the match only when it is part of the sequence
      ``<name> at <domain> dot <tld>``, i.e. when "at" sits exactly two
      tokens before it and one more token follows.  Then ``domain`` and
      ``top_level_domain`` are stored and the state becomes 2.
    * Anything else leaves the globals untouched.

    Args:
        i: Index of the token being examined.
        result: List of all tokens of the document.
    """
    global state, local_part, domain, top_level_domain
    if i <= 0:
        return
    token = result[i].lower()
    if token == "at":
        local_part = result[i - 1]
        state = 1
    elif (token == "dot"
          and 2 < i < len(result) - 1
          # Without this adjacency check an unrelated "dot" far behind the
          # "at" ("look at this ... example dot com") produced a bogus
          # address such as look@example.com.
          and result[i - 2].lower() == "at"):
        domain = result[i - 1]
        top_level_domain = result[i + 1]
        state = 2
# Email found, printing it out
def output_email():
    """Print the collected address as ``local_part@domain.top_level_domain``.

    Reads the module globals ``local_part``, ``domain`` and
    ``top_level_domain``; nothing is modified and nothing is returned.
    """
    # The globals are only read, so no ``global`` declarations are needed.
    # A single parenthesised argument behaves the same as the Python 2 print
    # statement and is also valid Python 3 syntax.
    print(local_part + "@" + domain + "." + top_level_domain)
# Transition matrix: maps the current state to the function handling it.
# State 2 ("email found") has no handler; the driver loop below prints the
# address and resets the state to 0 itself.
transitions = {
    0: transition_in_state0,
    1: transition_in_state1,
}

# get the input and start looking for email addresses in the document
conn = httplib.HTTPConnection('www.google.com')
try:
    conn.request('GET', '/search?q=at+gmail+dot+com')
    response = conn.getresponse()
    # Named "page" rather than "input" so the builtin is not shadowed.
    page = response.read()
finally:
    # Release the connection even when the request or the read fails.
    conn.close()

# strip the markup, then turn every delimiter character into a space so
# that the text can be split into plain words
page = remove_html_tags(page)
for delimiter in delimiters:
    page = page.replace(delimiter, " ")
result = page.split()

# applying the state machine on it
for i in range(len(result)):
    # Direct indexing: an unknown state raises a clear KeyError instead of
    # "'NoneType' object is not callable".
    transitions[state](i, result)
    if state == 2:
        output_email()
        state = 0
English
Recent Comments