diff --git a/README.md b/README.md
index 0aac161..3003df8 100644
--- a/README.md
+++ b/README.md
@@ -66,6 +66,7 @@ arg | Long | Description
 -e |--extract| Extract page's code to terminal or file. (Default: Terminal)
 -i |--input filename| Input file with URL(s) (seperated by line)
 -o |--output [filename]| Output page(s) to file(s) (for one page)
+-y |--yara | Perform Yara keyword search (0 = search the entire HTML object, 1 = search only the text).
 **Crawl**: | |
 -c |--crawl| Crawl website (Default output on /links.txt)
 -d |--cdepth| Set depth of crawl's travel (Default: 1)
@@ -98,6 +99,14 @@
 $ python torcrawl.py -u http://www.github.com | grep 'google-analytics'
 ```
 
+Extract a page only if it matches the Yara keyword rules (the default rules include a google- keyword):
+```shell
+$ python torcrawl.py -v -w -u https://github.com -e -y 0
+...
+```
+**_Note:_** Update res/keywords.yar to search for other keywords.
+Use ```-y 0``` to search the raw HTML and ```-y 1``` to search only the extracted text.
+
 Extract a set of webpages (imported from file) to terminal:
 
 ```shell
@@ -156,6 +165,24 @@
 $ python torcrawl.py -u http://www.github.com/ -c -e | grep ''
 ...
 ```
+
+### As Both + Keyword Search:
+You can crawl a website, run the keyword search and extract only the matching webpages into a folder with a single command:
+
+```shell
+$ python torcrawl.py -v -u http://www.github.com/ -c -d 2 -p 5 -e -y 0
+## TOR is ready!
+## URL: http://www.github.com/
+## Your IP: *.*.*.*
+## Crawler Started from http://www.github.com with step 1 and wait 5
+## Step 1 completed with: 11 results
+## File created on /script/path/FolderName/index.htm
+## File created on /script/path/FolderName/projects.html
+## ...
+```
+
+***Note:*** *Update res/keywords.yar to search for other keywords.
+Use ```-y 0``` to search the raw HTML and ```-y 1``` to search only the extracted text.*
 
 ## Demo:
 ![peek 2018-12-08 16-11](https://user-images.githubusercontent.com/9204902/49687660-f72f8280-fb0e-11e8-981e-1bbeeac398cc.gif)
diff --git a/modules/crawler.py b/modules/crawler.py
index 6594cf8..78b56a4 100644
--- a/modules/crawler.py
+++ b/modules/crawler.py
@@ -136,7 +136,8 @@ def crawl(self):
                         continue
 
                     ver_link = self.canonical(link)
-                    lst.add(ver_link)
+                    if ver_link is not None:
+                        lst.add(ver_link)
 
                 # For each <area> tag.
                 for link in soup.findAll('area'):
@@ -146,7 +147,8 @@ def crawl(self):
                         continue
 
                     ver_link = self.canonical(link)
-                    lst.add(ver_link)
+                    if ver_link is not None:
+                        lst.add(ver_link)
 
         # TODO: For images
         # TODO: For scripts
diff --git a/modules/extractor.py b/modules/extractor.py
index 5c2e422..d13310f 100644
--- a/modules/extractor.py
+++ b/modules/extractor.py
@@ -1,28 +1,68 @@
 #!/usr/bin/python
 import io
 import os
-import sys
+import yara as _yara
 import urllib.error
 import urllib.parse
 import urllib.request
 
 from urllib.error import HTTPError
 from urllib.error import URLError
+from http.client import InvalidURL
+from http.client import IncompleteRead
+from bs4 import BeautifulSoup
 
 
-def cinex(input_file, out_path):
+def text(response=None):
+    """ Strips scripts and styles from the HTML response and returns only
+    the text elements of the page.
+
+    :param response: HTTP Response.
+    :return: String: Text-only version of the response.
+    """
+    soup = BeautifulSoup(response, features="lxml")
+    for s in soup(['script', 'style']):
+        s.decompose()
+
+    return ' '.join(soup.stripped_strings)
+
+
+def check_yara(raw=None, yara=0):
+    """ Compiles the Yara keyword rules and checks the response for matches.
+
+    :param raw: HTTP Response body.
+    :param yara: Integer: Keyword search argument.
+    :return matches: List of Yara rule matches.
+    """
+
+    file_path = os.path.join('res', 'keywords.yar')
+
+    if raw is not None:
+        if yara == 1:
+            raw = text(response=raw).lower()
+
+        rules = _yara.compile(file_path)
+        matches = rules.match(data=raw)
+        if len(matches) != 0:
+            print("found a match!")
+        return matches
+
+
+def cinex(input_file, out_path, yara=None):
     """ Ingests the crawled links from the input_file,
     scrapes the contents of the resulting web pages and writes the contents to
     the into out_path/{url_address}.
 
     :param input_file: String: Filename of the crawled Urls.
     :param out_path: String: Pathname of results.
+    :param yara: Integer: Keyword search argument.
     :return: None
     """
     file = io.TextIOWrapper
     try:
         file = open(input_file, 'r')
     except IOError as err:
-        # error = sys.exc_info()[0]
         print(f"Error: {err}\n## Can't open: {input_file}")
 
     for line in file:
@@ -40,70 +80,111 @@ def cinex(input_file, out_path):
             print(f"Error: {error}")
             continue
 
-        # Extract page to file
+        # Extract page to file.
         try:
+            content = urllib.request.urlopen(line, timeout=10).read()
+
+            if yara is not None:
+                full_match_keywords = check_yara(content, yara)
+
+                if len(full_match_keywords) == 0:
+                    print('No matches found.')
+                    continue
+
             with open(out_path + "/" + output_file, 'wb') as results:
-                results.write(urllib.request.urlopen(line).read())
+                results.write(content)
             print(f"# File created on: {os.getcwd()}/{out_path}/{output_file}")
+        except HTTPError as e:
+            print(f"Cinex Error: {e.code}, cannot access: {e.url}")
+            continue
+        except InvalidURL:
+            print(f"Invalid URL: {line}\nSkipping...")
+            continue
+        except IncompleteRead:
+            print(f"IncompleteRead on {line}")
+            continue
         except IOError as err:
-            error = sys.exc_info()[0]
-            print(f"Error: {error}\nCan't write on file: {output_file}")
+            print(f"Error: {err}\nCan't write on file: {output_file}")
 
     file.close()
 
 
-def intermex(input_file):
+def intermex(input_file, yara):
     """ Input links from file and extract them into terminal.
 
     :param input_file: String: File name of links file.
+    :param yara: Integer: Keyword search argument.
     :return: None
     """
     try:
         with open(input_file, 'r') as file:
             for line in file:
-                print((urllib.request.urlopen(line).read()))
-    except (HTTPError, URLError) as err:
-        print(f"HTTPError: {err}")
+                content = urllib.request.urlopen(line).read()
+                if yara is not None:
+                    full_match_keywords = check_yara(raw=content, yara=yara)
+
+                    if len(full_match_keywords) == 0:
+                        print(f"No matches in: {line}")
+                print(content)
+    except (HTTPError, URLError, InvalidURL) as err:
+        print(f"Request Error: {err}")
     except IOError as err:
-        # error = sys.exc_info()[0]
         print(f"Error: {err}\n## Not valid file")
 
 
-def outex(website, output_file, out_path):
+def outex(website, output_file, out_path, yara):
     """ Scrapes the contents of the provided web address and outputs the
     contents to file.
 
     :param website: String: Url of web address to scrape.
     :param output_file: String: Filename of the results.
     :param out_path: String: Folder name of the output findings.
+    :param yara: Integer: Keyword search argument.
     :return: None
     """
     # Extract page to file
     try:
         output_file = out_path + "/" + output_file
+        content = urllib.request.urlopen(website).read()
+
+        if yara is not None:
+            full_match_keywords = check_yara(raw=content, yara=yara)
+
+            if len(full_match_keywords) == 0:
+                print(f"No matches in: {website}")
+
         with open(output_file, 'wb') as file:
-            file.write(urllib.request.urlopen(website).read())
+            file.write(content)
         print(f"## File created on: {os.getcwd()}/{output_file}")
-    except (HTTPError, URLError) as err:
-        print(f"HTTPError: {err}")
+    except (HTTPError, URLError, InvalidURL) as err:
+        print(f"Request Error: {err}")
     except IOError as err:
-        # error = sys.exc_info()[0]
         print(f"Error: {err}\n Can't write on file: {output_file}")
 
 
-def termex(website):
+def termex(website, yara):
     """ Scrapes provided web address and prints the results to the terminal.
 
     :param website: String: URL of website to scrape.
+    :param yara: Integer: Keyword search argument.
     :return: None
     """
     try:
-        print((urllib.request.urlopen(website).read()))
-    except (urllib.error.HTTPError, urllib.error.URLError) as err:
+        content = urllib.request.urlopen(website).read()
+        if yara is not None:
+            full_match_keywords = check_yara(content, yara)
+
+            if len(full_match_keywords) == 0:
+                # No match.
+                print(f"No matches in: {website}")
+                return
+
+        print(content)
+    except (HTTPError, URLError, InvalidURL) as err:
        print(f"Error: ({err}) {website}")
        return
 
 
-def extractor(website, crawl, output_file, input_file, out_path):
+def extractor(website, crawl, output_file, input_file, out_path, yara):
     """ Extractor - scrapes the resulting website or discovered links.
 
     :param website: String: URL of website to scrape.
@@ -112,19 +193,20 @@ def extractor(website, crawl, output_file, input_file, out_path):
     :param output_file: String: Filename of resulting output from scrape.
     :param input_file: String: Filename of crawled/discovered URLs
     :param out_path: String: Dir path for output files.
+    :param yara: Integer: Keyword search option.
     :return: None
     """
     # TODO: Return output to torcrawl.py
     if len(input_file) > 0:
         if crawl:
-            cinex(input_file, out_path)
+            cinex(input_file, out_path, yara)
         # TODO: Extract from list into a folder
         # elif len(output_file) > 0:
         #     inoutex(website, input_ile, output_file)
         else:
-            intermex(input_file)
+            intermex(input_file, yara)
     else:
         if len(output_file) > 0:
-            outex(website, output_file, out_path)
+            outex(website, output_file, out_path, yara)
         else:
-            termex(website)
+            termex(website, yara)
diff --git a/requirements.txt b/requirements.txt
index 53c3735..4b4d31e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
 pysocks
 beautifulsoup4>=4.7.1
 requests>=2.21.0
+yara-python
+lxml
\ No newline at end of file
diff --git a/res/keywords.yar b/res/keywords.yar
new file mode 100644
index 0000000..379f51e
--- /dev/null
+++ b/res/keywords.yar
@@ -0,0 +1,34 @@
+/*
+    Yara keyword rules.
+*/
+
+/*
+    rule email_filter
+    {
+        meta:
+            author = "@the-siegfried"
+            score = 20
+        strings:
+            $email_add = /\b[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)*\.[a-zA-Z-]+[\w-]\b/
+        condition:
+            any of them
+
+    }
+*/
+
+rule keyword_search
+{
+    meta:
+        author = "@the-siegfried"
+        score = 90
+
+    strings:
+        $a = "Keyword1" fullword wide ascii nocase
+        $b = "Keyword Two" wide ascii nocase
+        $c = "kw 3" ascii
+        $d = "KEYWORD four" nocase
+        $e = "google-" nocase
+
+    condition:
+        any of them
+}
\ No newline at end of file
diff --git a/torcrawl.py b/torcrawl.py
index a7a0157..2fdf6d1 100755
--- a/torcrawl.py
+++ b/torcrawl.py
@@ -20,6 +20,9 @@
 (Defualt: terminal)
 -i, --input filename : Input file with URL(s) (seperated by line)
 -o, --output [filename] : Output page(s) to file(s) (for one page)
+-y, --yara : Perform a Yara keyword search using the rules in the res/ folder
+             (0 = search the whole HTML object,
+             1 = search only the text).
 
 Crawl:
 -c, --crawl : Crawl website (Default output on /links.txt)
@@ -155,6 +158,12 @@ def main():
         '--folder',
         help='The root directory which will contain the generated files'
     )
+    parser.add_argument(
+        '-y',
+        '--yara',
+        help='Check for keywords and only scrape documents that contain a '
+             'match. 0 = search the whole HTML object; 1 = search only the text.'
+    )
 
     args = parser.parse_args()
 
@@ -164,6 +173,9 @@ def main():
     c_depth = args.cdepth if args.cdepth else 0
     c_pause = args.cpause if args.cpause else 1
 
+    yara_mode = int(args.yara) if args.yara is not None else None
+    if yara_mode not in (None, 0, 1):
+        parser.error("argument -y/--yara: expected argument 0 or 1.")
+
     # Connect to TOR
     if args.without is False:
         check_tor(args.verbose)
@@ -194,9 +206,11 @@ def main():
             print(f"## File created on {os.getcwd()}/{out_path}/links.txt")
         if args.extract:
             input_file = out_path + "/links.txt"
-            extractor(website, args.crawl, output_file, input_file, out_path)
+            extractor(website, args.crawl, output_file, input_file, out_path,
+                      yara_mode)
         else:
-            extractor(website, args.crawl, output_file, input_file, out_path)
+            extractor(website, args.crawl, output_file, input_file, out_path,
+                      yara_mode)
 
 
 # Stub to call main method.
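
A quick way to sanity-check the new keyword filter in isolation is to call `check_yara()` directly, outside the crawl/extract flow. The snippet below is only an illustrative sketch, not part of the patch: it assumes the patched module layout (`modules/extractor.py`, `res/keywords.yar`), the dependencies from requirements.txt (the yara bindings plus beautifulsoup4/lxml), that it is run from the repository root so the relative rule path resolves, and a placeholder URL.

```python
# Illustrative sketch: exercise check_yara() on a single page.
# Run from the repository root so 'res/keywords.yar' resolves.
import urllib.request

from modules.extractor import check_yara

url = "https://example.com"  # placeholder target, not taken from the patch

# Fetch the raw page body, as cinex()/termex() do.
content = urllib.request.urlopen(url, timeout=10).read()

# yara=0 matches the rules against the raw HTML response;
# yara=1 strips markup first (via the text() helper) and matches
# against the lowercased visible text only.
matches = check_yara(raw=content, yara=0)

if matches:
    print(f"Matched rules for {url}: {matches}")
else:
    print(f"No keyword matches for {url}")
```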