Structured Data Extractor for requests-html #594

Open · wants to merge 2 commits into master

54 changes: 49 additions & 5 deletions README.rst
@@ -113,8 +113,6 @@ Render out an Element's HTML:
>>> about.html
'<li aria-haspopup="true" class="tier-1 element-1 " id="about">\n<a class="" href="/about/" title="">About</a>\n<ul aria-hidden="true" class="subnav menu" role="menu">\n<li class="tier-2 element-1" role="treeitem"><a href="/about/apps/" title="">Applications</a></li>\n<li class="tier-2 element-2" role="treeitem"><a href="/about/quotes/" title="">Quotes</a></li>\n<li class="tier-2 element-3" role="treeitem"><a href="/about/gettingstarted/" title="">Getting Started</a></li>\n<li class="tier-2 element-4" role="treeitem"><a href="/about/help/" title="">Help</a></li>\n<li class="tier-2 element-5" role="treeitem"><a href="http://brochure.getpython.info/" title="">Python Brochure</a></li>\n</ul>\n</li>'



Select Elements within Elements:

.. code-block:: pycon
@@ -129,7 +127,6 @@ Search for links within an element:
>>> about.absolute_links
{'http://brochure.getpython.info/', 'https://www.python.org/about/gettingstarted/', 'https://www.python.org/about/', 'https://www.python.org/about/quotes/', 'https://www.python.org/about/help/', 'https://www.python.org/about/apps/'}


Search for text on the page:

.. code-block:: pycon
@@ -144,7 +141,7 @@ More complex CSS Selector example (copied from Chrome dev tools):
>>> r = session.get('https://github.com/')
>>> sel = 'body > div.application-main > div.jumbotron.jumbotron-codelines > div > div > div.col-md-7.text-center.text-md-left > p'
>>> print(r.html.find(sel, first=True).text)
GitHub is a development platform inspired by the way you work. From open source to business, you can host and review code, manage projects, and build software alongside millions of other developers.

XPath is also supported:

@@ -244,6 +241,53 @@ You can also use this library without Requests:
>>> html.links
{'https://httpbin.org'}

Structured Data Extraction
==========================

Extract structured data from repeated HTML patterns:

.. code-block:: pycon

>>> from requests_html import HTMLSession, ExtractorPattern, StructuredExtractor
>>> session = HTMLSession()
>>> r = session.get('https://example.com/products')

>>> # Define extraction pattern
>>> pattern = ExtractorPattern(
... selector=".product-card",
... fields={
... "title": ".product-title",
... "price": ".price",
... "description": ".description"
... },
... required_fields=["title", "price"]
... )

>>> # Extract structured data with a StructuredExtractor
>>> extractor = StructuredExtractor(r.html)
>>> products = extractor.extract_structured_data(pattern)
>>> products[0]
{'title': 'Example Product', 'price': '$99.99', 'description': 'A great product'}
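
``extract_structured_data`` also accepts an optional ``limit`` that caps how many matching elements are processed; a minimal sketch (the variable name and count are arbitrary):

.. code-block:: pycon

>>> # At most two items are returned (assuming the page has that many)
>>> first_two = extractor.extract_structured_data(pattern, limit=2)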

You can also extract data from elements you've already selected:

.. code-block:: pycon

>>> product_section = r.html.find('.products-section', first=True)
>>> products = StructuredExtractor(product_section).extract_structured_data(pattern)

The extractor supports required fields and will skip items missing those fields:

.. code-block:: pycon

>>> pattern = ExtractorPattern(
... selector=".article",
... fields={
... "title": "h2",
... "date": ".published-date",
... "author": ".author-name"
... },
... required_fields=["title", "date"] # Articles must have title and date
... )
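
A minimal usage sketch, assuming ``r`` holds a hypothetical page whose ``.article`` entries sometimes lack a publication date:

.. code-block:: pycon

>>> articles = StructuredExtractor(r.html).extract_structured_data(pattern)
>>> # Articles missing a title or date are skipped entirely; a missing
>>> # optional author is returned as an empty string.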

Installation
============
@@ -253,4 +297,4 @@ Installation
$ pipenv install requests-html
✨🍰✨

Only **Python 3.6 and above** is supported.
5 changes: 4 additions & 1 deletion pytest.ini
@@ -1,4 +1,7 @@
[pytest]
markers =
    render: marks tests for html render
    internet: marks tests which runs on internet pages
asyncio_mode = strict
python_files = test_*.py
testpaths = tests
78 changes: 77 additions & 1 deletion requests_html.py
@@ -4,7 +4,7 @@
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures._base import TimeoutError
from functools import partial
from typing import Set, Union, List, MutableMapping, Optional
from typing import Set, Union, List, MutableMapping, Optional, Dict

import pyppeteer
import requests
@@ -22,6 +22,9 @@
from parse import findall, Result
from w3lib.encoding import html_to_unicode

from dataclasses import dataclass


DEFAULT_ENCODING = 'utf-8'
DEFAULT_URL = 'https://example.org/'
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8'
@@ -843,3 +846,76 @@ def run(self, *coros):
        ]
        done, _ = self.loop.run_until_complete(asyncio.wait(tasks))
        return [t.result() for t in done]


@dataclass
class ExtractorPattern:
    selector: str
    fields: Dict[str, str]
    required_fields: Optional[List[str]] = None


class StructuredExtractor:
    """
    A utility class to extract structured data from repeated HTML patterns.
    This extends requests-html's capabilities for handling common web scraping patterns.
    """

    def __init__(self, html: HTML):
        self.html = html

    def extract_structured_data(
        self,
        pattern: ExtractorPattern,
        limit: Optional[int] = None
    ) -> List[Dict[str, str]]:
        """
        Extracts structured data from HTML based on defined patterns.

        Args:
            pattern: ExtractorPattern defining the selection rules
            limit: Optional maximum number of items to extract

        Returns:
            List of dictionaries containing the extracted data
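
        Example:
            A minimal, illustrative sketch (the ``.item`` selector and field
            names are hypothetical)::

                extractor = StructuredExtractor(html)
                pattern = ExtractorPattern(
                    selector='.item',
                    fields={'name': '.name', 'price': '.price'},
                )
                items = extractor.extract_structured_data(pattern, limit=5)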
"""
results = []
elements = self.html.find(pattern.selector, first=False)

if limit:
elements = elements[:limit]

for element in elements:
item_data = {}
is_valid = True

for field_name, field_selector in pattern.fields.items():
field_element = element.find(field_selector, first=True)

if field_element:
item_data[field_name] = field_element.text.strip()
elif pattern.required_fields and field_name in pattern.required_fields:
is_valid = False
break
else:
item_data[field_name] = ""

if is_valid:
results.append(item_data)

return results

    @classmethod
    def from_url(cls, url: str) -> 'StructuredExtractor':
        """
        Creates a StructuredExtractor instance from a URL.

        Args:
            url: The URL to fetch and parse

        Returns:
            StructuredExtractor instance
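
        Example:
            Illustrative only; fetches the given URL over the network::

                extractor = StructuredExtractor.from_url('https://example.com')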
"""
session = HTMLSession()
r = session.get(url)
return cls(r.html)

154 changes: 154 additions & 0 deletions tests/test_structured_extractor.py
@@ -0,0 +1,154 @@
import pytest
from requests_html import HTML, StructuredExtractor, ExtractorPattern

@pytest.fixture
def sample_html():
    return HTML(html='''
        <div class="container">
            <div class="product-card">
                <h2 class="product-title">iPhone 14</h2>
                <span class="price">$999</span>
                <p class="description">Latest iPhone model</p>
            </div>
            <div class="product-card">
                <h2 class="product-title">Samsung Galaxy S23</h2>
                <span class="price">$899</span>
                <p class="description">Flagship Android phone</p>
            </div>
            <div class="product-card">
                <h2 class="product-title">Google Pixel 7</h2>
                <!-- Missing price -->
                <p class="description">Google's flagship phone</p>
            </div>
        </div>
    ''')

@pytest.fixture
def basic_pattern():
    return ExtractorPattern(
        selector=".product-card",
        fields={
            "title": ".product-title",
            "price": ".price",
            "description": ".description"
        }
    )

@pytest.fixture
def pattern_with_required():
    return ExtractorPattern(
        selector=".product-card",
        fields={
            "title": ".product-title",
            "price": ".price",
            "description": ".description"
        },
        required_fields=["title", "price"]
    )

def test_extractor_initialization(sample_html):
    extractor = StructuredExtractor(sample_html)
    assert extractor.html == sample_html

def test_basic_extraction(sample_html, basic_pattern):
    extractor = StructuredExtractor(sample_html)
    results = extractor.extract_structured_data(basic_pattern)

    assert len(results) == 3
    assert results[0]["title"] == "iPhone 14"
    assert results[0]["price"] == "$999"
    assert results[0]["description"] == "Latest iPhone model"

def test_extraction_with_required_fields(sample_html, pattern_with_required):
    extractor = StructuredExtractor(sample_html)
    results = extractor.extract_structured_data(pattern_with_required)

    # Should only return 2 items since the third is missing the required price
    assert len(results) == 2
    assert all("price" in item for item in results)

def test_extraction_with_limit(sample_html, basic_pattern):
    extractor = StructuredExtractor(sample_html)
    results = extractor.extract_structured_data(basic_pattern, limit=1)

    assert len(results) == 1
    assert results[0]["title"] == "iPhone 14"

def test_missing_optional_field(sample_html):
    pattern = ExtractorPattern(
        selector=".product-card",
        fields={
            "title": ".product-title",
            "nonexistent": ".nonexistent-class"
        }
    )

    extractor = StructuredExtractor(sample_html)
    results = extractor.extract_structured_data(pattern)

    assert len(results) == 3
    assert all(item["nonexistent"] == "" for item in results)

def test_invalid_selector(sample_html):
    pattern = ExtractorPattern(
        selector=".nonexistent-container",
        fields={
            "title": ".product-title"
        }
    )

    extractor = StructuredExtractor(sample_html)
    results = extractor.extract_structured_data(pattern)

    assert len(results) == 0

def test_empty_html():
    """Test extraction with empty HTML"""
    html = HTML(html='<div></div>')
    extractor = StructuredExtractor(html)

    pattern = ExtractorPattern(
        selector=".product-card",
        fields={
            "title": ".product-title"
        }
    )

    results = extractor.extract_structured_data(pattern)
    assert len(results) == 0

def test_pattern_without_required_fields():
    """Test pattern initialization without required fields"""
    pattern = ExtractorPattern(
        selector=".product-card",
        fields={
            "title": ".product-title"
        }
    )
    assert pattern.required_fields is None

def test_from_url(requests_mock):
    """Test creating extractor from URL"""
    html_content = '''
        <div class="product-card">
            <h2 class="product-title">Test Product</h2>
            <span class="price">$100</span>
        </div>
    '''
    requests_mock.get("https://example.com", text=html_content)

    extractor = StructuredExtractor.from_url("https://example.com")
    assert isinstance(extractor, StructuredExtractor)

    pattern = ExtractorPattern(
        selector=".product-card",
        fields={
            "title": ".product-title",
            "price": ".price"
        }
    )

    results = extractor.extract_structured_data(pattern)
    assert len(results) == 1
    assert results[0]["title"] == "Test Product"
    assert results[0]["price"] == "$100"