Module parse
[hide private]
[frames] | no frames]

Source Code for Module parse

  1  """ 
  2  Plan: 
  3  Parse: 
  4  ip,bytesTx,referrer,statuscode,user-agent 
  5  datetime * 
  6   
  7  Insert: 
  8   
  9   
 10  Features: 
 11  possible broken links 
 12  request flooding 
 13  page's hits by weeks 
 14  page's hits throughout the day hence possible audience 
 15  """ 
 16   
 17   
 18   
 19  import time 
 20  import sqlite3 
 21  import re 
 22  from datetime import datetime 
 23           
class Parser:
    """Parse a server log file and insert its entries into an sqlite database.

    The database table is deleted and recreated during the creation of each
    instance of this class.
    """

    # Patterns are compiled once here instead of once per parsed line.
    _RE_IP = re.compile(r'^[^ ]+')            # first space-delimited token
    _RE_QUOTED = re.compile(r'"[^"]*"')       # request, referrer, user-agent
    _RE_STATUS_BYTES = re.compile(r' (\d+) (\d+|-) ')
    _RE_DATE = re.compile(r'\[(.+)\]')

    _INSERT_SQL = """INSERT INTO serverlog VALUES(?,?,?,?,?,?,?,?,?)"""
    _READ_HINT = 5000000  # size hint passed to readlines(), about 5 MB per batch

    def __init__(self, logfilename=None, dbfilelocation=None):
        """
        Create a Parser, reset the database table and load the whole log file.

        The work is delegated to the module-level helpers resetDb(), dbfile()
        and logfile(); the paths given here replace the module-level defaults
        those helpers read.

        @param logfilename: path of server log file
        @param dbfilelocation: location of the sqlite database file to create database
        """
        # Without this declaration the assignments below only create locals
        # and the supplied paths are silently ignored.
        global __logfilename__, __dbfilename__

        if logfilename is not None:
            __logfilename__ = logfilename
        if dbfilelocation is not None:
            __dbfilename__ = dbfilelocation

        resetDb()
        conn = sqlite3.connect(dbfile())
        try:
            fp = open(logfile())
            try:
                # Read about 5 MB of lines at a time. The loop is driven by
                # the lines read, not by the parsed rows, so a batch that
                # yields no rows does not end the import early.
                lines = fp.readlines(self._READ_HINT)
                while lines:
                    conn.executemany(self._INSERT_SQL, self.parseLines(lines))
                    lines = fp.readlines(self._READ_HINT)
            finally:
                fp.close()
            conn.commit()
        finally:
            conn.close()

    def parseLines(self, logLines):
        """Convert lines of the server log file into database rows.

        Each row has the format
        (ip, datetime, request-method, request-path, request-param,
        user-agent, status-code, bytesTx, referrer). Lines containing only
        whitespace are skipped.

        @param logLines: iterable collection of lines from server log file
        @return: list of tuples
        @raise ValueError: if a non-blank line lacks one of the expected fields
        """
        rows = []
        for line in logLines:
            if not line.strip():
                continue  # a blank line carries no record

            ipMatch = self._RE_IP.search(line)
            if ipMatch is None:
                raise ValueError('no client address in log line: %r' % (line,))

            # Double-quoted fields in order: request string, referrer, user-agent.
            quoted = [field.strip('"') for field in self._RE_QUOTED.findall(line)]
            if len(quoted) < 3:
                raise ValueError(
                    'expected request, referrer and user-agent in log line: %r'
                    % (line,))
            request, referrer, userAgent = quoted[:3]

            requestParts = request.split()
            if len(requestParts) != 3:
                raise ValueError('malformed request string in log line: %r' % (line,))
            requestMethod = requestParts[0]

            # Split on the first '?' only, so a query string that itself
            # contains '?' is kept whole. '-' marks "no query string".
            requestPath, sep, query = requestParts[1].partition('?')
            requestParam = query if sep else '-'

            numbers = self._RE_STATUS_BYTES.search(line)
            if numbers is None:
                raise ValueError('no status code / byte count in log line: %r' % (line,))
            statusCode, bytesTx = numbers.groups()

            rows.append((ipMatch.group(0), self.__parseDateTime__(line),
                         requestMethod, requestPath, requestParam, userAgent,
                         statusCode, bytesTx, referrer))
        return rows

    def __parseDateTime__(self, logLine):
        """Parse the date and time from a line of the log file. Intended for internal use.

        Only the first whitespace-separated token inside the square brackets
        is parsed; anything after it (such as a UTC offset) is discarded, so
        the result is a naive datetime.

        @param logLine: a line of logfile
        @return: python datetime object
        @raise ValueError: if no bracketed timestamp is found or it does not
            match the expected format
        """
        found = self._RE_DATE.search(logLine)
        tokens = found.group(1).split() if found is not None else []
        if not tokens:
            raise ValueError('no [timestamp] in log line: %r' % (logLine,))
        return datetime.strptime(tokens[0], "%d/%b/%Y:%H:%M:%S")  # date format

    def __Testparsedate__(self):
        """Smoke test for datetime parsing. Intended for internal use.

        Raises ValueError if the sample timestamp cannot be parsed.

        @return: Nothing
        """
        dateString = '15/Dec/2008:17:17:34 +0530'
        datetime.strptime(dateString.split()[0], "%d/%b/%Y:%H:%M:%S")
        return

    def __ParseTest__(self):
        """Scratch routine used while testing logfile parsing. Intended for internal use.

        Prints the intermediate result of each parsing step for a sample line.

        @return: Nothing
        """
        # Sample input; the routine previously referenced an undefined name.
        lines = ['10.105.2.6 - - [15/Dec/2008:17:17:34 +0530] '
                 '"GET /index.html HTTP/1.1" 200 1234 "-" "Mozilla/5.0"']

        #### ip #######
        reIp = re.compile(r'^[0-9\.]*')
        finds = reIp.findall('10.105.2.6 - -')
        captIp = finds[0]
        print(captIp)

        #### time ########
        reTime = re.compile(r'\[')
        # search(), not match(): the bracket is not at the start of the string.
        match = reTime.search('10 - - [1234 asd]12345[98766e]')
        print(match)
        capTime = match.groups()
        print(capTime)

        #### req #######
        reReq = re.compile(r'"[^"]*"')
        finds = reReq.findall(lines[0])
        if len(finds) > 0:
            captQuoted = finds[0]
            print(finds)
            print(captQuoted)
            striped = [find.strip('"') for find in finds]
            print(striped)

            # Strip every quoted field, not the characters of the first one.
            captQuoted = [quotedStr.strip('"') for quotedStr in finds]
            print(captQuoted)

        # regex for status code and txbytes
        reNums = re.compile(r' (\d+) (\d+|-) ')
        found = reNums.findall(lines[0])[0]
        print(found)
        assert len(found) == 2  # the pattern has exactly two groups
        status_code = found[0]
        bytesTx = found[1]

        print(status_code)
        print(bytesTx)

        return
# File path of the database file.
__dbfilename__ = 'test.db'


def dbfile():
    """Return the module-level database file path.

    @return: File path of the database file
    """
    database_path = __dbfilename__
    return database_path
# File path of the server log file.
__logfilename__ = 'access_log'


def logfile():
    """Return the module-level server log file path.

    @return: File path of the server log file
    """
    log_path = __logfilename__
    return log_path
216
def adapt_datetime(ts):
    """Date/time adapter for the database.

    @param ts: object providing a timetuple() method, such as a datetime
    @return: the result of time.mktime() for the time tuple of ts
    """
    time_fields = ts.timetuple()
    return time.mktime(time_fields)
221 222
def resetDb():
    """Delete the previously created serverlog table and create a fresh, empty one.

    The database file is the one returned by dbfile(). The drop uses
    IF EXISTS so that the first run against a new database file, where the
    table does not exist yet, does not fail.
    """
    conn = sqlite3.connect(dbfile())
    try:
        conn.execute('drop table if exists serverlog;')
        conn.execute("""create table serverlog(ip varchar(15),requestdate date,request_method varchar(4),request_path varchar(255),request_param varchar(255),user_agent varchar(100),status_code char(3), bytesTx integer, referrer varchar(255)) ;""")
    finally:
        # Release the connection even when one of the statements fails.
        conn.close()
# Status messages printed whenever this module is executed or imported.
# The parenthesised form prints the same text on Python 2 and Python 3.
print('Hello World')


print('Finished')