
Commit 568a859

Merge pull request #31 from MikeMeliz/refactoring
Refactoring & Improvements
2 parents 246d9aa + 63c1e3c commit 568a859

10 files changed: 119 additions, 204 deletions

.gitignore

Lines changed: 4 additions & 105 deletions
@@ -1,11 +1,12 @@
+# Project Specific
+output/*
+!output/.gitkeep
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class

-# C extensions
-*.so
-
 # Distribution / packaging
 .Python
 build/
@@ -26,12 +27,6 @@ share/python-wheels/
 *.egg
 MANIFEST

-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
@@ -51,74 +46,6 @@ coverage.xml
 .pytest_cache/
 cover/

-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-# For a library or package, you might want to ignore these files since the code is
-# intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-# This is especially recommended for binary packages to ensure reproducibility, and is more
-# commonly ignored for libraries.
-# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-# in version control.
-# https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
 # Environments
 .env
 .venv
@@ -128,33 +55,5 @@ ENV/
 env.bak/
 venv.bak/

-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
 # PyCharm
-# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-# and can be added to the global gitignore or merged into this file. For a more nuclear
-# option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/

README.md

Lines changed: 9 additions & 6 deletions
@@ -51,12 +51,12 @@ $ torcrawl -v -u http://www.github.com/ -c -d 2 -p 2
 ## Installation

 ### Easy Installation with pip:
-*Comming soon..*
+*Coming soon...*

 ### Manual Installation:
 1. **Clone this repository**:<br>
 `git clone https://github.com/MikeMeliz/TorCrawl.py.git`
-2. **Install dependecies**:<br>
+2. **Install dependencies**:<br>
 `pip install -r requirements.txt`
 3. **Install and Start TOR Service**:
 1. **Debian/Ubuntu**: <br>
@@ -82,12 +82,12 @@ $ torcrawl -v -u http://www.github.com/ -c -d 2 -p 2
 -f |--folder| The directory which will contain the generated files
 **Extract**: | |
 -e |--extract| Extract page's code to terminal or file (Default: Terminal)
--i |--input filename| Input file with URL(s) (seperated by line)
+-i |--input filename| Input file with URL(s) (separated by line)
 -o |--output [filename]| Output page(s) to file(s) (for one page)
 -y |--yara | Perform yara keyword search:<br>h = search entire html object,<br>t = search only text
 **Crawl**: | |
 -c |--crawl| Crawl website (Default output on website/links.txt)
--d |--cdepth| Set depth of crawler's travel (Default: 1)
+-d |--depth| Set depth of crawler's travel (Default: 1)
 -p |--pause| Seconds of pause between requests (Default: 0)
 -l |--log| Log file with visited URLs and their response code

@@ -134,8 +134,7 @@ $ python torcrawl.py -i links.txt


 ### As Crawler:
-Crawl the links of the webpage without the use of TOR,
-also show verbose output (really helpfull):
+Crawl the links of the webpage without the use of TOR, also show verbose output (really helpful):

 ```shell
 $ python torcrawl.py -v -w -u http://www.github.com/ -c
@@ -216,6 +215,10 @@ Feel free to contribute on this project! Just fork it, make any change on your f

 ## Changelog
 ```shell
+v1.32:
+* Removed 1 second default pause between requests
+* Several improvements on results
+* Improved logs
 v1.31:
 * Fixed Input Link NoneType Error
 * Fixed name mismatch

modules/checker.py

Lines changed: 17 additions & 9 deletions
@@ -50,14 +50,22 @@ def folder(website, verbose):

     :param website: String - URL of website to crawl.
     :param verbose: Boolean - Logging level.
-    :return: String 'out_path' - Path of the output folder.
+    :return: String 'output_folder' - Path of the output folder.
     """
-    out_path = website
-    if not os.path.exists(out_path):
-        os.makedirs(out_path)
+    parsed = urlparse(website)
+    if parsed.scheme != '':
+        output_folder = "output/" + urlparse(website).netloc
+    else:
+        output_folder = "output/" + website
+    if not os.path.exists(output_folder):
+        try:
+            os.makedirs(output_folder)
+        except FileExistsError:
+            if verbose:
+                print(f"## Folder exists already: {website}")
     if verbose:
-        print(f"## Folder created: {out_path}")
-    return out_path
+        print(f"## Folder created: {website}")
+    return output_folder


 def check_tor(verbose):
@@ -87,11 +95,11 @@ def check_ip():
     """ Checks users IP from external resource.
     :return: None or HTTPError
     """
-    addr = 'https://api.ipify.org/?format=json'
+    api_address = 'https://api.ipify.org/?format=json'
     try:
-        my_ip = load(urlopen(addr))['ip']
+        my_ip = load(urlopen(api_address))['ip']
         print(f'## Your IP: {my_ip}')
     except HTTPError as err:
         error = sys.exc_info()[0]
-        print(f"Error: {error} \n## IP cannot be obtained. \n## Is {addr} up? "
+        print(f"Error: {error} \n## IP cannot be obtained. \n## Is {api_address} up? "
              f"\n## HTTPError: {err}")

modules/crawler.py

Lines changed: 39 additions & 34 deletions
@@ -26,7 +26,7 @@ def excludes(self, link):
         :param link: String
         :return: Boolean
         """
-        now = datetime.datetime.now().strftime("%Y%m%d")
+        now = datetime.datetime.now().strftime("%y%m%d")

         # BUG: For NoneType Exceptions, got to find a solution here
         if link is None:
@@ -36,31 +36,33 @@ def excludes(self, link):
             return True
         # External links
         elif link.startswith('http') and not link.startswith(self.website):
-            file_path = self.out_path + '/' + now + '_extlinks.txt'
-            with open(file_path, 'w+', encoding='UTF-8') as lst_file:
+            file_path = self.out_path + '/' + now + '_ext-links.txt'
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
                 lst_file.write(str(link) + '\n')
             return True
         # Telephone Number
         elif link.startswith('tel:'):
             file_path = self.out_path + '/' + now + '_telephones.txt'
-            with open(file_path, 'w+', encoding='UTF-8') as lst_file:
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
                 lst_file.write(str(link) + '\n')
             return True
         # Mails
         elif link.startswith('mailto:'):
             file_path = self.out_path + '/' + now + '_mails.txt'
-            with open(file_path, 'w+', encoding='UTF-8') as lst_file:
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
                 lst_file.write(str(link) + '\n')
             return True
         # Type of files
-        elif re.search('^.*\\.(pdf|jpg|jpeg|png|gif|doc)$', link,
-                       re.IGNORECASE):
+        elif re.search('^.*\\.(pdf|jpg|jpeg|png|gif|doc)$', link, re.IGNORECASE):
+            file_path = self.out_path + '/' + now + '_files.txt'
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
+                lst_file.write(str(link) + '\n')
             return True

     def canonical(self, link):
         """ Canonicalization of the link.

-        :param link: String
+        :param link: String: URL(s)
         :return: String 'final_link': parsed canonical url.
         """
         # Already formatted
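As a side note, the switch from `'w+'` to `'a+'` is what lets the crawler accumulate external links, telephone numbers, mails and file URLs across pages instead of overwriting each list on every hit. A rough standalone sketch of that classification follows; the file suffixes and prefixes come from the diff, while the `classify_link` helper itself is hypothetical.

```python
import datetime
import os
import re


def classify_link(link: str, website: str, out_path: str) -> bool:
    """Append a link to a dated category file and report whether it should be excluded (sketch)."""
    now = datetime.datetime.now().strftime("%y%m%d")  # two-digit year, as in the refactored excludes()
    if link.startswith('http') and not link.startswith(website):
        category = '_ext-links.txt'
    elif link.startswith('tel:'):
        category = '_telephones.txt'
    elif link.startswith('mailto:'):
        category = '_mails.txt'
    elif re.search(r'^.*\.(pdf|jpg|jpeg|png|gif|doc)$', link, re.IGNORECASE):
        category = '_files.txt'
    else:
        return False  # keep the link in the crawl queue
    os.makedirs(out_path, exist_ok=True)
    with open(os.path.join(out_path, now + category), 'a+', encoding='UTF-8') as lst_file:
        lst_file.write(link + '\n')  # 'a+' appends, so results survive across pages
    return True


print(classify_link('mailto:admin@example.com', 'http://example.com', 'output/example.com'))  # True
```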
@@ -83,6 +85,20 @@ def canonical(self, link):
             final_link = self.website + "/" + link
             return final_link

+    def write_log(self, log):
+        log_path = self.out_path + '/crawler.log'
+        now = datetime.datetime.now()
+
+        if self.logs is True:
+            open(log_path, 'a+')
+        if self.logs is True and os.access(log_path, os.W_OK) is False:
+            print(f"## Unable to write to {self.out_path}/log.txt - Exiting")
+            sys.exit(2)
+        with open(log_path, 'a+', encoding='UTF-8') as log_file:
+            log_file.write(str(now) + " [crawler.py] " + log)
+            log_file.close()
+
+
     def crawl(self):
         """ Core of the crawler.
         :return: List (ord_lst) - List of crawled links.
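The new `write_log()` helper centralises what used to be ad-hoc prints plus a separate `log.txt`: every entry is timestamped, tagged with the emitting module, and appended to `crawler.log` inside the output folder. A minimal stand-alone approximation is sketched below; the `LogDemo` class and its constructor arguments are assumptions made for the demo, not part of the module.

```python
import datetime
import os


class LogDemo:
    def __init__(self, out_path, logs=True):
        self.out_path = out_path
        self.logs = logs

    def write_log(self, log):
        """Append a timestamped, module-tagged line to <out_path>/crawler.log (sketch of the new helper)."""
        if not self.logs:
            return
        os.makedirs(self.out_path, exist_ok=True)
        log_path = os.path.join(self.out_path, 'crawler.log')
        now = datetime.datetime.now()
        with open(log_path, 'a+', encoding='UTF-8') as log_file:
            log_file.write(f"{now} [crawler.py] {log}")


demo = LogDemo("output/example.com")
demo.write_log("[INFO] Parsed: [200] http://example.com \n")
```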
@@ -91,11 +107,6 @@ def crawl(self):
         ord_lst = []
         ord_lst.insert(0, self.website)
         ord_lst_ind = 0
-        log_path = self.out_path + '/log.txt'
-
-        if self.logs is True and os.access(log_path, os.W_OK) is False:
-            print(f"## Unable to write to {self.out_path}/log.txt - Exiting")
-            sys.exit(2)

         print(f"## Crawler started from {self.website} with "
               f"{str(self.c_depth)} depth crawl, and {str(self.c_pause)} "
@@ -113,27 +124,23 @@ def crawl(self):
                    if item is not None:
                        html_page = urllib.request.urlopen(item)
                except (HTTPError, URLError) as error:
-                    print('## ERROR: Domain or link seems to be '
-                          'unreachable. Add -v to see the verbose error.'
-                          'Or write the full URL at -u argument!')
-                    if self.verbose: print(error)
+                    self.write_log(f"[INFO] ERROR: Domain or link seems to be unreachable: {str(item)} | "
+                                   f"Message: {error}\n")
                    continue
            else:
                try:
                    html_page = urllib.request.urlopen(self.website)
                    ord_lst_ind += 1
                except (HTTPError, URLError) as error:
-                    print('## ERROR: Domain or link seems to be '
-                          'unreachable. Add -v to see the verbose error.'
-                          'Or write the full URL at -u argument!')
-                    if self.verbose: print(error)
+                    self.write_log(f"[INFO] ERROR: Domain or link seems to be unreachable: {str(item)} | "
+                                   f"Message: {error}\n")
                    ord_lst_ind += 1
                    continue

            try:
                soup = BeautifulSoup(html_page, features="html.parser")
-            except TypeError as err:
-                print(f"## Soup Error Encountered:: could to parse "
+            except TypeError:
+                print(f"## Soup Error Encountered:: couldn't parse "
                      f"ord_list # {ord_lst_ind}::{ord_lst[ord_lst_ind]}")
                continue

@@ -159,7 +166,7 @@ def crawl(self):
                if ver_link is not None:
                    lst.add(ver_link)

-        # TODO: For non-formal links, using RegEx
+        # TODO: For non-formal links, using RegEx, should be an additional parameter, and all patterns to be stored in a file
        # url_pattern = r'/(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])/igm'
        # html_content = urllib.request.urlopen(self.website).read().decode('utf-8')

@@ -180,21 +187,19 @@ def crawl(self):
            ord_lst = ord_lst + list(set(lst))
            ord_lst = list(set(ord_lst))

+            # Keeps logs for every webpage visited.
+            page_code = html_page.status
+            url_visited = f"[{str(page_code)}] {str(item)} \n"
+            self.write_log("[INFO] Parsed: " + url_visited)
+
            if self.verbose:
-                sys.stdout.write("-- Results: " + str(len(ord_lst)) + "\r")
+                sys.stdout.write(" -- Results: " + str(len(ord_lst)) + "\r")
                sys.stdout.flush()

-            # Pause time.
-            if (ord_lst.index(item) != len(ord_lst) - 1) and \
-                    float(self.c_pause) > 0:
+            # Add Pause time between each iteration
+            if (ord_lst.index(item) != len(ord_lst) - 1) and float(self.c_pause) > 0:
                time.sleep(float(self.c_pause))

-            # Keeps logs for every webpage visited.
-            if self.logs:
-                it_code = html_page.getcode()
-                with open(log_path, 'w+', encoding='UTF-8') as log_file:
-                    log_file.write(f"[{str(it_code)}] {str(item)} \n")
-
        print(f"## Step {str(index + 1)} completed "
              f"with: {str(len(ord_lst))} result(s)")
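Taken together, the crawl loop now records one log line per fetched page instead of rewriting `log.txt` with `'w+'`, and reads the response code from the `HTTPResponse.status` attribute rather than `getcode()`. A rough sketch of that per-page pattern, with a placeholder URL standing in for a crawled item:

```python
import urllib.request

# Sketch: fetch a page and build the "[status] url" entry the crawler now hands to write_log().
url = "http://www.github.com/"          # placeholder target, assumed for the demo
with urllib.request.urlopen(url) as html_page:
    page_code = html_page.status        # same attribute the refactored crawl() reads
    url_visited = f"[{page_code}] {url} \n"
print("[INFO] Parsed: " + url_visited)  # in the crawler this line is appended to crawler.log via write_log()
```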
