@@ -26,7 +26,7 @@ def excludes(self, link):
         :param link: String
         :return: Boolean
         """
-        now = datetime.datetime.now().strftime("%Y%m%d")
+        now = datetime.datetime.now().strftime("%y%m%d")

         # BUG: For NoneType Exceptions, got to find a solution here
         if link is None:
@@ -36,31 +36,33 @@ def excludes(self, link):
             return True
         # External links
         elif link.startswith('http') and not link.startswith(self.website):
-            file_path = self.out_path + '/' + now + '_extlinks.txt'
-            with open(file_path, 'w+', encoding='UTF-8') as lst_file:
+            file_path = self.out_path + '/' + now + '_ext-links.txt'
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
                 lst_file.write(str(link) + '\n')
             return True
         # Telephone Number
         elif link.startswith('tel:'):
             file_path = self.out_path + '/' + now + '_telephones.txt'
-            with open(file_path, 'w+', encoding='UTF-8') as lst_file:
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
                 lst_file.write(str(link) + '\n')
             return True
         # Mails
         elif link.startswith('mailto:'):
             file_path = self.out_path + '/' + now + '_mails.txt'
-            with open(file_path, 'w+', encoding='UTF-8') as lst_file:
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
                 lst_file.write(str(link) + '\n')
             return True
         # Type of files
-        elif re.search('^.*\\.(pdf|jpg|jpeg|png|gif|doc)$', link,
-                       re.IGNORECASE):
+        elif re.search('^.*\\.(pdf|jpg|jpeg|png|gif|doc)$', link, re.IGNORECASE):
+            file_path = self.out_path + '/' + now + '_files.txt'
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
+                lst_file.write(str(link) + '\n')
             return True

     def canonical(self, link):
         """ Canonicalization of the link.

-        :param link: String
+        :param link: String: URL(s)
         :return: String 'final_link': parsed canonical url.
         """
         # Already formatted
@@ -83,6 +85,20 @@ def canonical(self, link):
             final_link = self.website + "/" + link
         return final_link

+    def write_log(self, log):
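+        """ Appends a timestamped entry to crawler.log in the output path.
+
+        :param log: String: message to append (caller supplies any trailing newline).
+        """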
+        log_path = self.out_path + '/crawler.log'
+        now = datetime.datetime.now()
+
+        if self.logs is True:
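+            # Create the log file if it does not exist yet.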
+            open(log_path, 'a+').close()
+        if self.logs is True and os.access(log_path, os.W_OK) is False:
+            print(f"## Unable to write to {self.out_path}/crawler.log - Exiting")
+            sys.exit(2)
+        with open(log_path, 'a+', encoding='UTF-8') as log_file:
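+            # Prepend a timestamp and the source module to every entry.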
+            log_file.write(str(now) + " [crawler.py] " + log)
+
+
     def crawl(self):
         """ Core of the crawler.
         :return: List (ord_lst) - List of crawled links.
@@ -91,11 +107,6 @@ def crawl(self):
         ord_lst = []
         ord_lst.insert(0, self.website)
         ord_lst_ind = 0
-        log_path = self.out_path + '/log.txt'
-
-        if self.logs is True and os.access(log_path, os.W_OK) is False:
-            print(f"## Unable to write to {self.out_path}/log.txt - Exiting")
-            sys.exit(2)

         print(f"## Crawler started from {self.website} with "
               f"{str(self.c_depth)} depth crawl, and {str(self.c_pause)} "
@@ -113,27 +124,23 @@ def crawl(self):
                         if item is not None:
                             html_page = urllib.request.urlopen(item)
                     except (HTTPError, URLError) as error:
-                        print('## ERROR: Domain or link seems to be '
-                              'unreachable. Add -v to see the verbose error.'
-                              'Or write the full URL at -u argument!')
-                        if self.verbose: print(error)
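+                        # Log the unreachable link and move on to the next one.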
+                        self.write_log(f"[INFO] ERROR: Domain or link seems to be unreachable: {str(item)} | "
+                                       f"Message: {error}\n")
                         continue
                 else:
                     try:
                         html_page = urllib.request.urlopen(self.website)
                         ord_lst_ind += 1
                     except (HTTPError, URLError) as error:
-                        print('## ERROR: Domain or link seems to be '
-                              'unreachable. Add -v to see the verbose error.'
-                              'Or write the full URL at -u argument!')
-                        if self.verbose: print(error)
+                        self.write_log(f"[INFO] ERROR: Domain or link seems to be unreachable: {str(item)} | "
+                                       f"Message: {error}\n")
                         ord_lst_ind += 1
                         continue

                 try:
                     soup = BeautifulSoup(html_page, features="html.parser")
-                except TypeError as err:
-                    print(f"## Soup Error Encountered:: could to parse "
+                except TypeError:
+                    print(f"## Soup Error Encountered:: couldn't parse "
                           f"ord_list # {ord_lst_ind}::{ord_lst[ord_lst_ind]}")
                     continue

@@ -159,7 +166,7 @@ def crawl(self):
                     if ver_link is not None:
                         lst.add(ver_link)

-                # TODO: For non-formal links, using RegEx
+                # TODO: For non-formal links, using RegEx; should be an additional parameter, with all patterns stored in a file
                 # url_pattern = r'/(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])/igm'
                 # html_content = urllib.request.urlopen(self.website).read().decode('utf-8')
@@ -180,21 +187,19 @@ def crawl(self):
                 ord_lst = ord_lst + list(set(lst))
                 ord_lst = list(set(ord_lst))

+                # Keeps logs for every webpage visited.
+                page_code = html_page.status
+                url_visited = f"[{str(page_code)}] {str(item)}\n"
+                self.write_log("[INFO] Parsed: " + url_visited)
+
                 if self.verbose:
-                    sys.stdout.write("-- Results: " + str(len(ord_lst)) + "\r")
+                    sys.stdout.write(" -- Results: " + str(len(ord_lst)) + "\r")
                     sys.stdout.flush()

-                # Pause time.
-                if (ord_lst.index(item) != len(ord_lst) - 1) and \
-                        float(self.c_pause) > 0:
+                # Add a pause between each iteration.
+                if (ord_lst.index(item) != len(ord_lst) - 1) and float(self.c_pause) > 0:
                     time.sleep(float(self.c_pause))

-                # Keeps logs for every webpage visited.
-                if self.logs:
-                    it_code = html_page.getcode()
-                    with open(log_path, 'w+', encoding='UTF-8') as log_file:
-                        log_file.write(f"[{str(it_code)}] {str(item)}\n")
-
             print(f"## Step {str(index + 1)} completed "
                   f"with: {str(len(ord_lst))} result(s)")