Email Parser

by .hac on November 16th, 2009

Email Parser ([any name] AT [any domain name] DOT [any top-level domain])

This is a proof-of-concept (PoC) parser written in Python.
It uses the page at http://www.google.com/search?q=at+gmail+dot+com as its input.

import re, httplib

# Characters that get replaced by a space before the text is split into
# tokens (the double quote appears twice in this string).
delimiters = '[]()<>""'

# Current state of the parser:
#   0 - looking for "AT"
#   1 - "AT" seen, looking for "DOT"
#   2 - complete address found, ready to be printed
state = 0

# Pieces of the address being assembled by the transition functions.
local_part = domain = top_level_domain = ""

# removing all HTML tags and character references
def remove_html_tags(data):
	"""Return ``data`` with HTML tags and character references blanked out.

	Every tag (``<...>``) and every well-formed character reference
	(``&name;``, ``&#123;``, ``&#x1F;``) is replaced by a single space so
	that the words around it stay separate tokens.

	data -- the HTML text to clean
	"""
	# [^>]* also crosses line breaks, so a tag wrapped over several lines
	# is removed as a whole.
	without_tags = re.sub(r'<[^>]*>', ' ', data)
	# Only real references are matched; a bare "&" followed much later by
	# a ";" must not swallow the text in between.
	return re.sub(
		r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);',
		' ',
		without_tags)

# State 0: looking for "AT"
def transition_in_state0(i, result):
	"""Handle token ``i`` of ``result`` while no "AT" has been seen yet.

	When the token is "at" (any letter case) and is not the first token,
	the preceding token is stored in the global ``local_part`` and the
	global ``state`` becomes 1. Otherwise nothing changes.

	i      -- index of the token being examined
	result -- list of all tokens
	"""
	global state, local_part
	# The first token has nothing in front of it to use as a local part.
	if i <= 0:
		return
	if result[i].lower() != "at":
		return
	local_part = result[i - 1]
	state = 1

# State 1: "AT" already discovered, now looking for "DOT"
def transition_in_state1(i, result):
	"""Handle token ``i`` of ``result`` after an "AT" has been seen.

	A further "at" restarts the match: the preceding token becomes the new
	global ``local_part`` and ``state`` stays 1. A "dot" that has a token on
	each side stores those neighbours in the globals ``domain`` and
	``top_level_domain`` and sets ``state`` to 2. Comparisons ignore letter
	case; any other token leaves the globals untouched.

	i      -- index of the token being examined
	result -- list of all tokens
	"""
	global state, local_part, domain, top_level_domain
	# Both keywords need a token in front of them.
	if i <= 0:
		return
	token = result[i].lower()
	if token == "at":
		local_part = result[i - 1]
		state = 1
		return
	# "dot" additionally needs a following token to act as the TLD.
	if token == "dot" and i < len(result) - 1:
		domain = result[i - 1]
		top_level_domain = result[i + 1]
		state = 2

# Email found, printing it out
def output_email():
	"""Print the assembled address as ``local_part@domain.top_level_domain``.

	The three parts are read from the module-level globals of the same
	names. Nothing is returned.
	"""
	# Parenthesised single-argument form: prints the same text under the
	# Python 2 print statement and is also valid Python 3.
	print(local_part + "@" + domain + "." + top_level_domain)

# Transition table: maps the current state to the function that handles the
# next token in that state. Only states 0 and 1 have an entry.
transitions = dict([
	(0, transition_in_state0),
	(1, transition_in_state1),
])

# get the input and start looking for email addresses in the document
conn = httplib.HTTPConnection('www.google.com')
try:
	conn.request('GET', '/search?q=at+gmail+dot+com')
	response = conn.getresponse()
	# "page" rather than "input", so the builtin input() is not shadowed
	page = response.read()
finally:
	# release the socket even when the request or the read fails
	conn.close()

# blanking out the markup and every delimiter character so that only
# whitespace-separated words remain
page = remove_html_tags(page)
for delimiter in delimiters:
	page = page.replace(delimiter, " ")

result = page.split()

# applying the state machine on it, one token position at a time
for i in range(len(result)):
	transitions[state](i, result)

	# state 2 means a complete address was assembled: print it and start
	# looking for the next one
	if state == 2:
		output_email()
		state = 0
  1. No comments yet.
  1. No trackbacks yet.