# -*- Mode: Python; tab-width: 4 -*-
# $Id: parsdate.py,v 1.3 1996/04/11 06:58:40 rushing Exp $

# Parse various date headers you might find in mail or news, including
# some common violations of rfc822
#
# Author: Sam Rushing (rushing@nightmare.com)
#
# Much of this is a translation of timezone.el from GNU Emacs,
# in particular timezone.el's regexps are taken without modification.
# Many thanks to Masanobu Umeda, Rich Salz and anyone else who's
# contributed to timezone.el or parsedate.y

# I've tested this with ~10,000 date headers from my own mail,
# and it matches parsedate.y on all that can be parsed by both.
# This module can handle several formats that parsedate.y can't.

import regex
import string
import time

error = "parsedate error"

short_days = '\\(mon\\|tue\\|wed\\|thu\\|fri\\|sat\\|sun\\)'
long_days = '\\(monday\\|tuesday\\|wednesday\\|thursday\\|friday\\|saturday\\|sunday\\)'

# strip off extra leading whitespace, and the day of the week
dow_stripper = regex.compile (
	'[ \t]*\\(\\(%s\\|%s\\),?[ \t]*\\)?' % (long_days, short_days),
	regex.casefold
	)

# from timezone.el, the function timezone-parse-date
# Understands the following styles:
#  (1) 14 Apr 89 03:20[:12] [GMT]
#  (2) Fri, 17 Mar 89 4:01[:33] [GMT]
#  (3) Mon Jan 16 16:12[:37] [GMT] 1989
#  (4) 6 May 1992 1641-JST (Wednesday)
#  (5) 22-AUG-1993 10:59:12.82
# The day of the week has to be stripped off first (SMR).
# RFC850 lives on, in HTTP.
#  (6) Weekday, DD-Mon-YY HH:MM:SS TIMEZONE

# regular expressions are your friends!
date_regex_list =[ \
	# Styles: (1) and (2) without timezone
	(regex.compile ("\\([0-9]+\\)[ \t]+\\([^ \t,]+\\)[ \t]+\\([0-9]+\\)[ \t]+\\([0-9]+:[0-9:]+\\)[ \t]*\\'"),
	 (3,2,1,4,None)),
	# Styles: (1) and (2) with timezone and buggy timezone
	(regex.compile ("\\([0-9]+\\)[ \t]+\\([^ \t,]+\\)[ \t]+\\([0-9]+\\)[ \t]+\\([0-9]+:[0-9:]+\\)[ \t]*\\([-+a-zA-Z0-9]+\\)"),
	 (3,2,1,4,5)),
	# Styles: (3) without timezone
	(regex.compile ("\\([^ \t,]+\\)[ \t]+\\([0-9]+\\)[ \t]+\\([0-9]+:[0-9:]+\\)[ \t]+\\([0-9]+\\)"),
	 (4,1,2,3,None)),
	# Styles: (3) with timezone
	(regex.compile ("\\([^ \t,]+\\)[ \t]+\\([0-9]+\\)[ \t]+\\([0-9]+:[0-9:]+\\)[ \t]+\\([-+a-zA-Z0-9]+\\)[ \t]+\\([0-9]+\\)"),
	 (5,1,2,3,4)),
	# Styles: (4) with timezone
	(regex.compile ("\\([0-9]+\\)[ \t]+\\([^ \t,]+\\)[ \t]+\\([0-9]+\\)[ \t]+\\([0-9]+\\)[ \t]*\\([-+a-zA-Z0-9]+\\)"),
	 (3,2,1,4,5)),
	# Styles: (5) without timezone.
	(regex.compile ("\\([0-9]+\\)-\\([A-Za-z]+\\)-\\([0-9]+\\)[ \t]+\\([0-9]+:[0-9]+:[0-9]+\\)\\.[0-9]+"),
	 (3,2,1,4,None)),
	# Styles: (6) with timezone.
	(regex.compile ("\\([0-9]+\\)-\\([A-Za-z]+\\)-\\([0-9]+\\)[ \t]+\\([0-9]+:[0-9]+:[0-9]+\\)[ \t]*\\([-+a-zA-Z0-9]+\\)"),
	 (3,2,1,4,5))
	]

# Split a date header into its textual fields.
#
# Leading whitespace and a day-of-week prefix are dropped, then each
# entry of date_regex_list is tried in turn; the first pattern that
# matches the whole remaining string wins.
#
# Returns the strings (year, month, day, time, timezone); timezone is
# None for styles that do not carry one.
# Raises error if no pattern matches the whole string.

def partition_date (date):
	prefix_len = dow_stripper.match (date)
	if prefix_len != -1:
		date = date[prefix_len:]
	for pattern, order in date_regex_list:
		# a partial match is not good enough
		if pattern.match (date) != len (date):
			continue
		year_g, month_g, day_g, time_g, zone_g = order
		zone = None
		if zone_g != None:
			zone = pattern.group (zone_g)
		return (pattern.group (year_g),
				pattern.group (month_g),
				pattern.group (day_g),
				pattern.group (time_g),
				zone)
	raise error, "couldn't partition date"

# format is (regex, seconds_present)
time_regex_list = [ \
	# HH:MM:SS
	(regex.compile("\\`\\([0-9]+\\):\\([0-9]+\\):\\([0-9]+\\)\\'"), 			1),
	# HH:MM
	(regex.compile("\\`\\([0-9]+\\):\\([0-9]+\\)\\'"), 							0),
	# HHMMSS
	(regex.compile("\\`\\([0-9][0-9]\\)\\([0-9][0-9]\\)\\([0-9][0-9]\\)\\'"),	1),
	# HHMM
	(regex.compile("\\`\\([0-9][0-9]\\)\\([0-9][0-9]\\)\\'"), 					0)
	]

# Convert a time-of-day string into an integer triplet (hh, mm, ss).
# The layouts in time_regex_list are tried in order; when the matching
# layout has no seconds field, ss is 0.
# Raises error if no layout matches.
def parse_time (time):
	for pattern, has_seconds in time_regex_list:
		if pattern.match (time) == -1:
			continue
		hh = string.atoi (pattern.group (1))
		mm = string.atoi (pattern.group (2))
		ss = 0
		if has_seconds == 1:
			ss = string.atoi (pattern.group (3))
		return (hh, mm, ss)
	raise error, "couldn't partition time"

# this table is based on the one in 'parsedate.y', from the INN 1.4 distribution.
#
# Each value is (dst, hours, minutes):
#   dst      is -1 for a daylight/summer zone (one extra hour east), else 0
#   hours    is the standard offset from GMT, positive east of Greenwich
#   minutes  is the rest of the standard offset and carries the same
#            sign as hours, so that 3600*hours + 60*minutes is the
#            standard offset in seconds (Newfoundland is -3:30, hence
#            -3 and -30)

zone_map = {
	"gmt":	( 0, 0, 0),	#	Greenwich Mean
	"ut":	( 0, 0, 0),	#	Universal
	"utc":	( 0, 0, 0),	#	Universal Coordinated
	"cut":	( 0, 0, 0),	#	Coordinated Universal
	"z":	( 0, 0, 0),	#	Greenwich Mean
	"wet":	( 0, 0, 0),	#	Western European
	"bst":	(-1, 0, 0),	#	British Summer
	"nst":	( 0,-3,-30),	#	Newfoundland Standard
	"ndt":	(-1,-3,-30),	#	Newfoundland Daylight
	"ast":	( 0,-4, 0),	#	Atlantic Standard
	"adt":	(-1,-4, 0),	#	Atlantic Daylight
	"est":	( 0,-5, 0),	#	Eastern Standard
	"edt":	(-1,-5, 0),	#	Eastern Daylight
	"cst":	( 0,-6, 0),	#	Central Standard
	"cdt":	(-1,-6, 0),	#	Central Daylight
	"mst":	( 0,-7, 0),	#	Mountain Standard
	"mdt":	(-1,-7, 0),	#	Mountain Daylight
	"pst":	( 0,-8, 0),	#	Pacific Standard
	"pdt":	(-1,-8, 0),	#	Pacific Daylight
	"yst":	( 0,-9, 0),	#	Yukon Standard
	"ydt":	(-1,-9, 0),	#	Yukon Daylight
	"akst":	( 0,-9, 0),	#	Alaska Standard
	"akdt":	(-1,-9, 0),	#	Alaska Daylight
	"hst":	( 0,-10,0),	#	Hawaii Standard
	"hast":	( 0,-10,0),	#	Hawaii-Aleutian Standard
	"hadt":	(-1,-10,0),	#	Hawaii-Aleutian Daylight
	"ces":	(-1, 1, 0),	#	Central European Summer
	"cest":	(-1, 1, 0),	#	Central European Summer
	"mez":	( 0, 1, 0),	#	Middle European
	"mezt":	(-1, 1, 0),	#	Middle European Summer
	"cet":	( 0, 1, 0),	#	Central European
	"met":	( 0, 1, 0),	#	Middle European
	"eet":	( 0, 2, 0),	#	Eastern Europe
	"msk":	( 0, 3, 0),	#	Moscow Winter
	"msd":	(-1, 3, 0),	#	Moscow Summer
	"wast":	( 0, 8, 0),	#	West Australian Standard
	"wadt":	(-1, 8, 0),	#	West Australian Daylight
	"hkt":	( 0, 8, 0),	#	Hong Kong
	"cct":	( 0, 8, 0),	#	China Coast
	"jst":	( 0, 9, 0),	#	Japan Standard
	"kst":	( 0, 9, 0),	#	Korean Standard
	"kdt":	(-1, 9, 0),	#	Korean Daylight
	"cast":	( 0, 9,30),	#	Central Australian Standard
	"cadt":	(-1, 9,30),	#	Central Australian Daylight
	"east":	( 0,10, 0),	#	Eastern Australian Standard
	"eadt":	(-1,10, 0),	#	Eastern Australian Daylight
	"nzst":	( 0,12, 0),	#	New Zealand Standard
	"nzdt":	(-1,12, 0)	#	New Zealand Daylight
	}

# recognizes numeric timezone offsets.
numeric_timezone_reg = regex.compile('[+-]?[0-9]?[0-9][0-9][0-9]')

# tzn[+|-]hh[:mm[:ss]][dzn]
# tzn := 3-letter tzname
# dzn := 3-letter daylight zone name
# group 1 holds the hh[:mm[:ss]] string.
symbolic_timezone_reg = \
   regex.compile("[a-zA-Z][a-zA-Z][a-zA-Z]"+
				 "\\(\\([+-]?[0-9]+\\)\\(:[0-9]+\\(:[0-9]+\\)?\\)?\\)"+
				 "\\([a-zA-Z][a-zA-Z][a-zA-Z]\\)?")

# Return the offset from GMT, in seconds, of this timezone.
# The result is positive east of Greenwich: "+0930" gives 34200 and
# "-0500" gives -18000.
#
# Three forms are understood, tried in this order:
#   numeric    [+-]hmm or [+-]hhmm
#   named      a key of zone_map, in any case
#   TZ style   tzn[+|-]hh[:mm[:ss]][dzn]
# In every form the sign applies to the whole offset, minutes
# included, so "-0530" is five and a half hours west.
#
# Raises error if tz fits none of the forms.
def parse_timezone (tz):
	# this is the preferred format: -0500, +0930
	# the whole string has to be numeric; a mere numeric prefix
	# is left for the other forms to accept or reject.
	if numeric_timezone_reg.match (tz) == len (tz):
		sign = 1
		digits = tz
		if tz[0] == '-':
			sign = -1
		if tz[0] in '+-':
			digits = tz[1:]
		minute = string.atoi (digits[-2:])
		hour = string.atoi (digits[:-2])
		return sign * ((3600 * hour) + 60 * minute)

	tz = string.lower (tz)
	# semi-standard timezone name: EST, PST, MET
	# use table lookup
	if zone_map.has_key (tz):
		dstoff, hour, minute = zone_map[tz]
		# the minutes point the same way as the hours, whichever
		# sign the table happens to store them with
		if hour < 0:
			minute = -abs (minute)
		return (3600 * -dstoff) + (3600 * hour) + 60 * minute

	# TZ environment variable format,
	# not legal in rfc822 or rfc1036, but it shows
	# up occasionally.
	# note that the numeric offset is negative
	if symbolic_timezone_reg.match (tz) != -1:
		hms = symbolic_timezone_reg.group (1)
		sign = 1
		if hms[0] == '-':
			sign = -1
		if hms[0] in '+-':
			hms = hms[1:]
		# weigh hh, mm, ss as 3600, 60 and 1 seconds respectively
		factor = 3600
		offset = 0
		for x in map (string.atoi, string.splitfields (hms, ':')):
			offset = offset + factor * x
			factor = factor / 60
		return -sign * offset
	raise error, "couldn't parse timezone"

# lower-case month abbreviations; the position in the list, plus one,
# is the month number
month_table = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
			   'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# Parse a date header.
#
# Returns a pair: the value computed by time.mktime for the date
# (adjusted for the header's timezone when one could be parsed), and
# the broken-down fields (yy, mm, dd, (hh, mm, ss), tz), where tz is
# the timezone text from the header or None.
#
# Short years follow the rfc2822 rule for obsolete dates: a two-digit
# year below 50 means 20xx, any other two- or three-digit year has
# 1900 added to it.
#
# A timezone that is present but cannot be parsed is reported on
# stdout and the date is then treated as local time, exactly as if
# no timezone had been given.
#
# Raises error if the header, the month name or the time cannot be
# parsed.
def parsedate (date):
	yy,mm,dd,tt,tz = partition_date (date)
	ndigits = len (yy)
	yy = string.atoi (yy)
	if ndigits == 2:
		if yy < 50:
			yy = yy + 2000
		else:
			yy = yy + 1900
	elif ndigits == 3:
		yy = yy + 1900
	mm = string.lower (mm)
	if not mm in month_table:
		raise error, "bogus month name"
	mm = month_table.index (mm) + 1
	dd = string.atoi (dd)
	tt = parse_time (tt)
	tzoff = None
	if tz:
		try:
			tzoff = parse_timezone (tz)
		except:
			# deliberately best-effort: whatever goes wrong with the
			# timezone, fall back to local time instead of failing
			print "couldn't parse timezone"
			tzoff = None
	fields = (yy,mm,dd,tt,tz)
	if tzoff is not None:
		# time.timezone is thrown in because mktime is relative to the local tz
		# (the final 0 arg tells mktime not to apply DST to the fields)
		stamp = time.mktime ((yy,mm,dd,tt[0],tt[1],tt[2],0,0,0))
		return stamp - (time.timezone + tzoff), fields
	# the final -1 arg means 'let the system decide if DST applies for this date'
	return time.mktime ((yy,mm,dd,tt[0],tt[1],tt[2],0,0,-1)), fields

# def test():
# 	import parsdate
# 	fd = open('f:/tmp/dates.txt', 'r')
# 	while 1:
# 		line = fd.readline()
# 		if line == '':
# 			break
# 		line = line[6:-1]
# 		try:
# 			date1 = parsedate (line)
# 		except error, why:
# 			print why, line
# 			break
# 		date2 = parsdate.parsedate (line)
# 		if date2 == -1:
# 			print 'parsdate parse failed on '+line
# 		else:
# 			if date1 != date2:
# 				print 'difference: '+line
# 		print '.'
#
# dates seen but not yet handled (violators of rfc822 in a bad way)
# (from amdahl.com) 'Friday, 7 January 1994 07:03 PT'
# (from psi.com) dates with a TZ of 'U'... the code punts to GMT.

