From c1d9cc52ab7311610cc7f7b7ff4cea03667e347c Mon Sep 17 00:00:00 2001 From: Aaron Bishop Date: Thu, 17 Mar 2022 23:36:53 +0000 Subject: [PATCH 1/7] Implement Yara Keyword Search - Adds new argument '-y' to accept yara switch. - Implements new 'check_yara' method in extractor module to check for keyword matches from .yar file. - Implements new 'text' method to extract only the text elements from the http content response for yara parsing. - Amends extractor methods to accept new 'yara' argument and utilise new 'check_yara' and 'text' methods. See Issue: #14 --- modules/extractor.py | 119 ++++++++++++++++++++++++++++++++++++------- requirements.txt | 1 + torcrawl.py | 13 ++++- 3 files changed, 112 insertions(+), 21 deletions(-) diff --git a/modules/extractor.py b/modules/extractor.py index 5c2e422..299ef4b 100644 --- a/modules/extractor.py +++ b/modules/extractor.py @@ -2,20 +2,54 @@ import io import os import sys +import yara as _yara import urllib.error import urllib.parse import urllib.request from urllib.error import HTTPError from urllib.error import URLError +from http.client import InvalidURL +from http.client import IncompleteRead +from bs4 import BeautifulSoup -def cinex(input_file, out_path): + +def check_yara(raw=None, cat=False): + """ + Validates Yara Rule to categorize the site and check for keywords. + """ + + file_path = os.path.join('res/keywords.yar') + if cat: + file_path = os.path.join('res/categories.yar') + + if raw is not None: + file = os.path.join(file_path) + rules = _yara.compile(file) + matches = rules.match(data=raw) + return matches + + +def text(response=None): + """ + Removes all the garbage from the HTML and takes only text elements + from the page. + """ + soup = BeautifulSoup(response, features="lxml") + for s in soup(['script', 'style']): + s.decompose() + + return ' '.join(soup.stripped_strings) + + +def cinex(input_file, out_path, yara): """ Ingests the crawled links from the input_file, scrapes the contents of the resulting web pages and writes the contents to the into out_path/{url_address}. :param input_file: String: Filename of the crawled Urls. :param out_path: String: Pathname of results. + :param yara: Boolean: :return: None """ file = io.TextIOWrapper @@ -40,48 +74,83 @@ def cinex(input_file, out_path): print(f"Error: {error}") continue - # Extract page to file + # Extract page to file. try: + content = urllib.request.urlopen(line, timeout=10).read() + + if yara: + full_match_keywords = check_yara(raw=text( + response=content).lower()) + + if len(full_match_keywords) == 0: + print('No matches found.') + continue + with open(out_path + "/" + output_file, 'wb') as results: - results.write(urllib.request.urlopen(line).read()) + results.write(content) print(f"# File created on: {os.getcwd()}/{out_path}/{output_file}") + except HTTPError as e: + print(f"Cinex Error: {e.code}, cannot access: {e.url}") + continue + except InvalidURL as e: + print(f"Invalid URL: {line} \n Skipping...") + continue + except IncompleteRead as e: + print(f"IncompleteRead on {line}") + continue except IOError as err: - error = sys.exc_info()[0] - print(f"Error: {error}\nCan't write on file: {output_file}") + print(f"Error: {err}\nCan't write on file: {output_file}") file.close() -def intermex(input_file): +def intermex(input_file, yara): """ Input links from file and extract them into terminal. :param input_file: String: File name of links file. 
+ :param yara: Boolean: :return: None """ try: with open(input_file, 'r') as file: for line in file: - print((urllib.request.urlopen(line).read())) - except (HTTPError, URLError) as err: - print(f"HTTPError: {err}") + content = urllib.request.urlopen(line).read() + if yara: + full_match_keywords = check_yara(raw=text( + response=content).lower()) + + if len(full_match_keywords) == 0: + print(f"No matched in: {line}") + print(content) + except (HTTPError, URLError, InvalidURL) as err: + print(f"Request Error: {err}") except IOError as err: - # error = sys.exc_info()[0] print(f"Error: {err}\n## Not valid file") -def outex(website, output_file, out_path): +def outex(website, output_file, out_path, yara): """ Scrapes the contents of the provided web address and outputs the contents to file. :param website: String: Url of web address to scrape. :param output_file: String: Filename of the results. :param out_path: String: Folder name of the output findings. + :param yara: Boolean: :return: None """ # Extract page to file try: output_file = out_path + "/" + output_file + content = urllib.request.urlopen(website).read() + + if yara: + full_match_keywords = check_yara(raw=text( + response=content).lower()) + + if len(full_match_keywords) == 0: + print(f"No matched in: {website}") + with open(output_file, 'wb') as file: - file.write(urllib.request.urlopen(website).read()) + file.write(content) print(f"## File created on: {os.getcwd()}/{output_file}") except (HTTPError, URLError) as err: print(f"HTTPError: {err}") @@ -90,20 +159,31 @@ def outex(website, output_file, out_path): print(f"Error: {err}\n Can't write on file: {output_file}") -def termex(website): +def termex(website, yara): """ Scrapes provided web address and prints the results to the terminal. :param website: String: URL of website to scrape. + :param yara: Boolean: :return: None """ try: - print((urllib.request.urlopen(website).read())) + content = urllib.request.urlopen(website).read() + if yara: + full_match_keywords = check_yara(raw=text( + response=content).lower()) + + if len(full_match_keywords) == 0: + # No match. + print(f"No matched in: {website}") + return + + print(content) except (urllib.error.HTTPError, urllib.error.URLError) as err: print(f"Error: ({err}) {website}") return -def extractor(website, crawl, output_file, input_file, out_path): +def extractor(website, crawl, output_file, input_file, out_path, yara): """ Extractor - scrapes the resulting website or discovered links. :param website: String: URL of website to scrape. @@ -112,19 +192,20 @@ def extractor(website, crawl, output_file, input_file, out_path): :param output_file: String: Filename of resulting output from scrape. :param input_file: String: Filename of crawled/discovered URLs :param out_path: String: Dir path for output files. 
+    :yara: Boolean:
     :return: None
     """
     # TODO: Return output to torcrawl.py
     if len(input_file) > 0:
         if crawl:
-            cinex(input_file, out_path)
+            cinex(input_file, out_path, yara)
         # TODO: Extract from list into a folder
         # elif len(output_file) > 0:
         #     inoutex(website, input_ile, output_file)
         else:
-            intermex(input_file)
+            intermex(input_file, yara)
     else:
         if len(output_file) > 0:
-            outex(website, output_file, out_path)
+            outex(website, output_file, out_path, yara)
         else:
-            termex(website)
+            termex(website, yara)
diff --git a/requirements.txt b/requirements.txt
index 53c3735..f61f800 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 pysocks
 beautifulsoup4>=4.7.1
 requests>=2.21.0
+yara
\ No newline at end of file
diff --git a/torcrawl.py b/torcrawl.py
index a7a0157..99b2cff 100755
--- a/torcrawl.py
+++ b/torcrawl.py
@@ -155,6 +155,13 @@ def main():
         '--folder',
         help='The root directory which will contain the generated files'
     )
+    parser.add_argument(
+        '-y',
+        '--yara',
+        action='store_true',
+        help='Check for keywords and only scrape documents that contain a '
+             'match.'
+    )
 
     args = parser.parse_args()
 
@@ -194,9 +201,11 @@ def main():
             print(f"## File created on {os.getcwd()}/{out_path}/links.txt")
         if args.extract:
             input_file = out_path + "/links.txt"
-            extractor(website, args.crawl, output_file, input_file, out_path)
+            extractor(website, args.crawl, output_file, input_file, out_path,
+                      args.yara)
     else:
-        extractor(website, args.crawl, output_file, input_file, out_path)
+        extractor(website, args.crawl, output_file, input_file, out_path,
+                  args.yara)
 
 
 # Stub to call main method.

From 5e3a7e6b2a853494ccf46d868f9be918200a8c40 Mon Sep 17 00:00:00 2001
From: Aaron Bishop
Date: Mon, 21 Mar 2022 23:57:27 +0000
Subject: [PATCH 2/7] Implement Yara Keyword Search

- Refactors check_yara method to remove category checking (categorisation
  is deferred as a future feature).
- Updates main application docstring.

See Issue: #14
---
 modules/extractor.py |  4 +---
 res/keywords.yar     | 33 +++++++++++++++++++++++++++++++++
 torcrawl.py          |  2 ++
 3 files changed, 36 insertions(+), 3 deletions(-)
 create mode 100644 res/keywords.yar

diff --git a/modules/extractor.py b/modules/extractor.py
index 299ef4b..9c7178d 100644
--- a/modules/extractor.py
+++ b/modules/extractor.py
@@ -14,14 +14,12 @@
 from bs4 import BeautifulSoup
 
 
-def check_yara(raw=None, cat=False):
+def check_yara(raw=None):
     """
     Validates Yara Rule to categorize the site and check for keywords.
     """
 
     file_path = os.path.join('res/keywords.yar')
-    if cat:
-        file_path = os.path.join('res/categories.yar')
 
     if raw is not None:
         file = os.path.join(file_path)
diff --git a/res/keywords.yar b/res/keywords.yar
new file mode 100644
index 0000000..c16454b
--- /dev/null
+++ b/res/keywords.yar
@@ -0,0 +1,33 @@
+/*
+    Yara.
+*/
+
+/*
+    rule email_filter
+    {
+        meta:
+            author = "@the-siegfried"
+            score = 20
+        strings:
+            $email_add = /\b[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)*\.[a-zA-Z-]+[\w-]\b/
+        condition:
+            any of them
+
+    }
+*/
+
+rule keyword_search
+{
+    meta:
+        author = "@the-siegfried"
+        score = 90
+
+    strings:
+        $a = "Keyword1" fullword wide ascii nocase
+        $b = "Keyword Two" wide ascii nocase
+        $c = "kw 3" asci
+        $d = "KEYWORD four" nocase
+
+    condition:
+        any of them
+}
\ No newline at end of file
diff --git a/torcrawl.py b/torcrawl.py
index 99b2cff..1ba392c 100755
--- a/torcrawl.py
+++ b/torcrawl.py
@@ -20,6 +20,8 @@
     (Defualt: terminal)
 -i, --input filename : Input file with URL(s) (seperated by line)
 -o, --output [filename] : Output page(s) to file(s) (for one page)
+-y, --yara : Yara keyword search page categorisation
+             read in from /res folder.
 
 Crawl:
 -c, --crawl : Crawl website (Default output on /links.txt)

From 5c69418320ac00b3549a4bf0bf92339606e47beb Mon Sep 17 00:00:00 2001
From: Aaron Bishop
Date: Tue, 22 Mar 2022 00:01:21 +0000
Subject: [PATCH 3/7] Implement Yara Keyword Search

- Removes commented-out code.
- Amends absolute module calls.

See Issue: #14
---
 modules/extractor.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/modules/extractor.py b/modules/extractor.py
index 9c7178d..8b7e859 100644
--- a/modules/extractor.py
+++ b/modules/extractor.py
@@ -54,7 +54,6 @@
     try:
         file = open(input_file, 'r')
    except IOError as err:
-        # error = sys.exc_info()[0]
         print(f"Error: {err}\n## Can't open: {input_file}")
 
     for line in file:
@@ -150,10 +149,9 @@
         with open(output_file, 'wb') as file:
             file.write(content)
         print(f"## File created on: {os.getcwd()}/{output_file}")
-    except (HTTPError, URLError) as err:
+    except (HTTPError, URLError, InvalidURL) as err:
         print(f"HTTPError: {err}")
     except IOError as err:
-        # error = sys.exc_info()[0]
         print(f"Error: {err}\n Can't write on file: {output_file}")
 
 
@@ -176,7 +174,7 @@
         print(content)
-    except (urllib.error.HTTPError, urllib.error.URLError) as err:
+    except (HTTPError, URLError, InvalidURL) as err:
         print(f"Error: ({err}) {website}")
         return
 

From 0d373cc9c3b76371d08e84d8fe6c3a2d45c53c22 Mon Sep 17 00:00:00 2001
From: Aaron Bishop
Date: Wed, 23 Mar 2022 22:38:32 +0000
Subject: [PATCH 4/7] Implement Yara Keyword Search

- Updates requirements.txt to include lxml.
- Amends crawl method in crawler.py to discard None values.

See Issue: #14
---
 modules/crawler.py | 6 ++++--
 requirements.txt   | 3 ++-
 res/keywords.yar   | 3 ++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/modules/crawler.py b/modules/crawler.py
index 6594cf8..78b56a4 100644
--- a/modules/crawler.py
+++ b/modules/crawler.py
@@ -136,7 +136,8 @@ def crawl(self):
                     continue
 
                 ver_link = self.canonical(link)
-                lst.add(ver_link)
+                if ver_link is not None:
+                    lst.add(ver_link)
 
             # For each <area> tag.
             for link in soup.findAll('area'):
@@ -146,7 +147,8 @@
                     continue
 
                 ver_link = self.canonical(link)
-                lst.add(ver_link)
+                if ver_link is not None:
+                    lst.add(ver_link)
 
             # TODO: For images
             # TODO: For scripts
diff --git a/requirements.txt b/requirements.txt
index f61f800..4b4d31e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 pysocks
 beautifulsoup4>=4.7.1
 requests>=2.21.0
-yara
\ No newline at end of file
+yara
+lxml
\ No newline at end of file
diff --git a/res/keywords.yar b/res/keywords.yar
index c16454b..d52bcf3 100644
--- a/res/keywords.yar
+++ b/res/keywords.yar
@@ -25,8 +25,9 @@ rule keyword_search
     strings:
         $a = "Keyword1" fullword wide ascii nocase
         $b = "Keyword Two" wide ascii nocase
-        $c = "kw 3" asci
+        $c = "kw 3" ascii
         $d = "KEYWORD four" nocase
+        $e = "necessarily" nocase
 
     condition:
         any of them

From 5e2733503ca25cf653c834433fa0105ee8f4c8ab Mon Sep 17 00:00:00 2001
From: Aaron Bishop
Date: Thu, 24 Mar 2022 19:11:44 +0000
Subject: [PATCH 5/7] Implement Yara Keyword Search

- Amends torcrawl.py so the '-y' argument accepts a value.
- Adds conditional handling for unexpected arguments to option '-y'.
- Refactors modules/extractor.py to perform content parsing within
  check_yara method based on -y argument.
- Updates README.md to provide instructions for '-y/--yara' argument use.
- Updates res/keywords.yar to support README.md examples.

See Issue: #14
---
 README.md            | 27 ++++++++++++++++++++++
 modules/extractor.py | 53 ++++++++++++++++++++++----------------------
 res/keywords.yar     |  2 +-
 torcrawl.py          | 13 ++++++-----
 4 files changed, 63 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index 0aac161..3003df8 100644
--- a/README.md
+++ b/README.md
@@ -66,6 +66,7 @@ arg | Long | Description
 -e |--extract| Extract page's code to terminal or file. (Default: Terminal)
 -i |--input filename| Input file with URL(s) (seperated by line)
 -o |--output [filename]| Output page(s) to file(s) (for one page)
+-y |--yara | Perform yara keyword search (0 = search entire HTML object, 1 = search only text).
 **Crawl**: | |
 -c |--crawl| Crawl website (Default output on /links.txt)
 -d |--cdepth| Set depth of crawl's travel (Default: 1)
@@ -98,6 +99,14 @@
 $ python torcrawl.py -u http://www.github.com | grep 'google-analytics'
 ```
 
+Extract a page to file only if it contains a yara keyword match (e.g. `google-`):
+```shell
+$ python torcrawl.py -v -w -u https://github.com -e -y 0
+...
+```
+**_Note:_** update res/keywords.yar to search for other keywords.
+Use ```-y 0``` for raw html searching and ```-y 1``` for text search only.
+
 Extract a set of webpages (imported from file) to terminal:
 
 ```shell
@@ -156,6 +165,24 @@
 $ python torcrawl.py -u http://www.github.com/ -c -e | grep ''
 ...
 ```
 
+### As Both + Keyword Search:
+You can crawl a page, perform a keyword search and extract the webpages that match the findings into a folder with a single command:
+
+```shell
+$ python torcrawl.py -v -u http://www.github.com/ -c -d 2 -p 5 -e -y 0
+## TOR is ready!
+## URL: http://www.github.com/
+## Your IP: *.*.*.*
+## Crawler Started from http://www.github.com with step 1 and wait 5
+## Step 1 completed with: 11 results
+## File created on /script/path/FolderName/index.htm
+## File created on /script/path/FolderName/projects.html
+## ...
+```
+
+***Note:*** *Update res/keywords.yar to search for other keywords.
+Use ```-y 0``` for raw html searching and ```-y 1``` for text search only.* + ## Demo: ![peek 2018-12-08 16-11](https://user-images.githubusercontent.com/9204902/49687660-f72f8280-fb0e-11e8-981e-1bbeeac398cc.gif) diff --git a/modules/extractor.py b/modules/extractor.py index 8b7e859..1c0fd45 100644 --- a/modules/extractor.py +++ b/modules/extractor.py @@ -14,7 +14,19 @@ from bs4 import BeautifulSoup -def check_yara(raw=None): +def text(response=None): + """ + Removes all the garbage from the HTML and takes only text elements + from the page. + """ + soup = BeautifulSoup(response, features="lxml") + for s in soup(['script', 'style']): + s.decompose() + + return ' '.join(soup.stripped_strings) + + +def check_yara(raw=None, yara=0): """ Validates Yara Rule to categorize the site and check for keywords. """ @@ -22,25 +34,18 @@ def check_yara(raw=None): file_path = os.path.join('res/keywords.yar') if raw is not None: + if yara == 1: + raw = text(response=raw).lower() + file = os.path.join(file_path) rules = _yara.compile(file) matches = rules.match(data=raw) + if len(matches) != 0: + print("found a match!") return matches -def text(response=None): - """ - Removes all the garbage from the HTML and takes only text elements - from the page. - """ - soup = BeautifulSoup(response, features="lxml") - for s in soup(['script', 'style']): - s.decompose() - - return ' '.join(soup.stripped_strings) - - -def cinex(input_file, out_path, yara): +def cinex(input_file, out_path, yara=None): """ Ingests the crawled links from the input_file, scrapes the contents of the resulting web pages and writes the contents to the into out_path/{url_address}. @@ -75,9 +80,8 @@ def cinex(input_file, out_path, yara): try: content = urllib.request.urlopen(line, timeout=10).read() - if yara: - full_match_keywords = check_yara(raw=text( - response=content).lower()) + if yara is not None: + full_match_keywords = check_yara(content, yara) if len(full_match_keywords) == 0: print('No matches found.') @@ -111,9 +115,8 @@ def intermex(input_file, yara): with open(input_file, 'r') as file: for line in file: content = urllib.request.urlopen(line).read() - if yara: - full_match_keywords = check_yara(raw=text( - response=content).lower()) + if yara is not None: + full_match_keywords = check_yara(raw=content, yara=yara) if len(full_match_keywords) == 0: print(f"No matched in: {line}") @@ -139,9 +142,8 @@ def outex(website, output_file, out_path, yara): output_file = out_path + "/" + output_file content = urllib.request.urlopen(website).read() - if yara: - full_match_keywords = check_yara(raw=text( - response=content).lower()) + if yara is not None: + full_match_keywords = check_yara(raw=content, yara=yara) if len(full_match_keywords) == 0: print(f"No matched in: {website}") @@ -164,9 +166,8 @@ def termex(website, yara): """ try: content = urllib.request.urlopen(website).read() - if yara: - full_match_keywords = check_yara(raw=text( - response=content).lower()) + if yara is not None: + full_match_keywords = check_yara(content, yara) if len(full_match_keywords) == 0: # No match. 
diff --git a/res/keywords.yar b/res/keywords.yar index d52bcf3..379f51e 100644 --- a/res/keywords.yar +++ b/res/keywords.yar @@ -27,7 +27,7 @@ rule keyword_search $b = "Keyword Two" wide ascii nocase $c = "kw 3" ascii $d = "KEYWORD four" nocase - $e = "necessarily" nocase + $e = "google-" nocase condition: any of them diff --git a/torcrawl.py b/torcrawl.py index 1ba392c..2fdf6d1 100755 --- a/torcrawl.py +++ b/torcrawl.py @@ -21,7 +21,8 @@ -i, --input filename : Input file with URL(s) (seperated by line) -o, --output [filename] : Output page(s) to file(s) (for one page) -y, --yara : Yara keyword search page categorisation - read in from /res folder. + read in from /res folder. 0 search whole html object. + 1 search only the text. Crawl: -c, --crawl : Crawl website (Default output on /links.txt) @@ -160,9 +161,8 @@ def main(): parser.add_argument( '-y', '--yara', - action='store_true', help='Check for keywords and only scrape documents that contain a ' - 'match.' + 'match. 0 search whole html object. 1 search only the text.' ) args = parser.parse_args() @@ -173,6 +173,9 @@ def main(): c_depth = args.cdepth if args.cdepth else 0 c_pause = args.cpause if args.cpause else 1 + if int(args.yara) not in [0, 1]: + parser.error("argument -y/--yara: expected argument 0 or 1.") + # Connect to TOR if args.without is False: check_tor(args.verbose) @@ -204,10 +207,10 @@ def main(): if args.extract: input_file = out_path + "/links.txt" extractor(website, args.crawl, output_file, input_file, out_path, - args.yara) + int(args.yara)) else: extractor(website, args.crawl, output_file, input_file, out_path, - args.yara) + int(args.yara)) # Stub to call main method. From e86c4d1dc5464d6837d871a6f43a355c342affc0 Mon Sep 17 00:00:00 2001 From: Aaron Bishop Date: Thu, 24 Mar 2022 22:44:53 +0000 Subject: [PATCH 6/7] Implement Yara Keyword Search - Updates docstrings in modules/extractor.py See Issue: #14 --- modules/extractor.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/modules/extractor.py b/modules/extractor.py index 1c0fd45..394f6d8 100644 --- a/modules/extractor.py +++ b/modules/extractor.py @@ -1,7 +1,6 @@ #!/usr/bin/python import io import os -import sys import yara as _yara import urllib.error import urllib.parse @@ -15,9 +14,11 @@ def text(response=None): - """ - Removes all the garbage from the HTML and takes only text elements - from the page. + """ Removes all the garbage from the HTML and takes only text elements + from the page. + + :param response: HTTP Response. + :return: String: Text only stripped response. """ soup = BeautifulSoup(response, features="lxml") for s in soup(['script', 'style']): @@ -27,8 +28,11 @@ def text(response=None): def check_yara(raw=None, yara=0): - """ - Validates Yara Rule to categorize the site and check for keywords. + """ Validates Yara Rule to categorize the site and check for keywords. + + :param raw: HTTP Response body. + :param yara: Integer: Keyword search argument. + :return matches: List of yara rule matches. """ file_path = os.path.join('res/keywords.yar') @@ -52,7 +56,7 @@ def cinex(input_file, out_path, yara=None): :param input_file: String: Filename of the crawled Urls. :param out_path: String: Pathname of results. - :param yara: Boolean: + :param yara: Integer: Keyword search argument. :return: None """ file = io.TextIOWrapper @@ -108,7 +112,7 @@ def intermex(input_file, yara): """ Input links from file and extract them into terminal. :param input_file: String: File name of links file. 
-    :param yara: Boolean:
+    :param yara: Integer: Keyword search argument.
     :return: None
     """
     try:
@@ -134,7 +138,7 @@ def outex(website, output_file, out_path, yara):
     :param website: String: Url of web address to scrape.
     :param output_file: String: Filename of the results.
     :param out_path: String: Folder name of the output findings.
-    :param yara: Boolean:
+    :param yara: Integer: Keyword search argument.
     :return: None
     """
     # Extract page to file
@@ -161,7 +165,7 @@ def termex(website, yara):
     """ Scrapes provided web address and prints the results to the
     terminal.
 
     :param website: String: URL of website to scrape.
-    :param yara: Boolean:
+    :param yara: Integer: Keyword search argument.
     :return: None
     """
     try:
@@ -189,7 +193,7 @@ def extractor(website, crawl, output_file, input_file, out_path, yara):
     :param output_file: String: Filename of resulting output from scrape.
     :param input_file: String: Filename of crawled/discovered URLs
     :param out_path: String: Dir path for output files.
-    :yara: Boolean:
+    :param yara: Integer: keyword search option.
     :return: None
     """
     # TODO: Return output to torcrawl.py

From 32d7b7a96a5051397f21a57accad37abb20644d2 Mon Sep 17 00:00:00 2001
From: Aaron Bishop
Date: Thu, 24 Mar 2022 22:47:31 +0000
Subject: [PATCH 7/7] Implement Yara Keyword Search

- Resolves grammatical mistake.

See Issue: #14
---
 modules/extractor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/extractor.py b/modules/extractor.py
index 394f6d8..d13310f 100644
--- a/modules/extractor.py
+++ b/modules/extractor.py
@@ -123,7 +123,7 @@ def intermex(input_file, yara):
                     full_match_keywords = check_yara(raw=content, yara=yara)
 
                     if len(full_match_keywords) == 0:
-                        print(f"No matched in: {line}")
+                        print(f"No matches in: {line}")
                 print(content)
     except (HTTPError, URLError, InvalidURL) as err:
         print(f"Request Error: {err}")
@@ -150,7 +150,7 @@ def outex(website, output_file, out_path, yara):
             full_match_keywords = check_yara(raw=content, yara=yara)
 
             if len(full_match_keywords) == 0:
-                print(f"No matched in: {website}")
+                print(f"No matches in: {website}")
 
         with open(output_file, 'wb') as file:
             file.write(content)
@@ -175,7 +175,7 @@ def termex(website, yara):
 
             if len(full_match_keywords) == 0:
                 # No match.
-                print(f"No matched in: {website}")
+                print(f"No matches in: {website}")
                 return
 
             print(content)
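
Taken together, the series wires up one flow: fetch the page, optionally reduce it to bare text with BeautifulSoup, then match the result against the rules in res/keywords.yar. The sketch below condenses that flow into a single standalone script for trying the rules outside the crawler. It mirrors the text() and check_yara() code added above, but it is not part of the series itself: the URL is only an example, and it assumes the yara Python bindings (the yara-python package provides the `yara` module the patches import), beautifulsoup4 and lxml are installed, with a res/keywords.yar like the one created in PATCH 2.

```python
#!/usr/bin/python
# Standalone sketch of the yara matching flow added by this series.
# Assumptions: `pip install yara-python beautifulsoup4 lxml` and a
# res/keywords.yar rule file like the one created in PATCH 2.
import urllib.request

import yara as _yara
from bs4 import BeautifulSoup


def text(response=None):
    """Strip script/style elements and keep only the page text."""
    soup = BeautifulSoup(response, features="lxml")
    for s in soup(['script', 'style']):
        s.decompose()
    return ' '.join(soup.stripped_strings)


def check_yara(raw=None, yara=0):
    """Match raw bytes (-y 0) or lowercased text (-y 1) against the rules."""
    if raw is None:
        return None
    if yara == 1:
        raw = text(response=raw).lower()
    rules = _yara.compile('res/keywords.yar')
    return rules.match(data=raw)


if __name__ == '__main__':
    # Example URL only; any reachable page works.
    content = urllib.request.urlopen('http://example.com', timeout=10).read()
    matches = check_yara(raw=content, yara=1)
    print(matches if matches else 'No matches found.')
```

As in the patched extractor, `-y 0` feeds the raw response bytes straight to the rules (so markup and scripts are searchable too), while `-y 1` first strips the page down to lowercase text, which is why the README distinguishes the two modes.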