from __future__ import annotations

import asyncio
import re
from asyncio import Task
from functools import partial
from typing import Annotated, Any, Union

from pydantic import BaseModel, Field, TypeAdapter

from crawlee import Request
from crawlee._types import HttpMethod
from crawlee.http_clients import BaseHttpClient, HttpxHttpClient
from crawlee.storages import RequestList as CrawleeRequestList

from apify._utils import docs_group

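# Matches http(s) URLs; commas are excluded from the allowed path characters,
# so individual URLs can be extracted from comma- or whitespace-separated text files.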
URL_NO_COMMAS_REGEX = re.compile(
    r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
)


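# Pydantic models describing a single entry of the Actor's `requestListSources` input.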
class _RequestDetails(BaseModel):
    method: HttpMethod = 'GET'
    payload: str = ''
    headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
    user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')] = {}


class _RequestsFromUrlInput(_RequestDetails):
    requests_from_url: str = Field(alias='requestsFromUrl')


class _SimpleUrlInput(_RequestDetails):
    url: str


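# Each input entry is validated as either a remote source (`requestsFromUrl`) or a plain URL (`url`).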
url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])


@docs_group('Classes')
class RequestList(CrawleeRequestList):
    """Extends the Crawlee `RequestList`.

    The `open` method creates a `RequestList` from the Actor's `requestListSources` input.
    """

    @staticmethod
    async def open(
        name: str | None = None,
        request_list_sources_input: list[dict[str, Any]] | None = None,
        http_client: BaseHttpClient | None = None,
    ) -> RequestList:
        """Create a `RequestList` from the Actor input `requestListSources`.

        Args:
            name: Name of the returned `RequestList`.
            request_list_sources_input: List of dicts, each with either a `url` or a `requestsFromUrl` key.
            http_client: Client used to send GET requests to the URLs given by the `requestsFromUrl` values.

        Returns:
            RequestList created from `request_list_sources_input`.

        ### Usage

        ```python
        example_input = [
            # Gather URLs from the response body.
            {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
            # Directly include this URL.
            {'url': 'https://crawlee.dev', 'method': 'GET'},
        ]
        request_list = await RequestList.open(request_list_sources_input=example_input)
        ```
        """
        request_list_sources_input = request_list_sources_input or []
        return await RequestList._create_request_list(name, request_list_sources_input, http_client)

    @staticmethod
    async def _create_request_list(
        name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: BaseHttpClient | None
    ) -> RequestList:
        if not http_client:
            http_client = HttpxHttpClient()

        url_inputs = url_input_adapter.validate_python(request_list_sources_input)

        simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
        remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]

        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)

        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)

    @staticmethod
    def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
        return [
            Request.from_url(
                method=request_input.method,
                url=request_input.url,
                payload=request_input.payload.encode('utf-8'),
                headers=request_input.headers,
                user_data=request_input.user_data,
            )
            for request_input in simple_url_inputs
        ]

    @staticmethod
    async def _fetch_requests_from_url(
        remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient
    ) -> list[Request]:
        """Create a list of requests from remote URLs.

        Sends a GET request to the URL in each `requests_from_url` input. An extraction callback runs on each
        response body and uses the `URL_NO_COMMAS_REGEX` regex to find all links. A `Request` is created for every
        link found, carrying over the additional details (method, payload, headers, user data) from the
        originating input.
        """
        created_requests: list[Request] = []

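        # The response body is parsed in a done callback, so that all fetches below can run concurrently.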
        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
            """Callback to scrape the response body with a regex and create `Request` objects from the matches."""
            matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
            created_requests.extend(
                [
                    Request.from_url(
                        match.group(0),
                        method=request_input.method,
                        payload=request_input.payload.encode('utf-8'),
                        headers=request_input.headers,
                        user_data=request_input.user_data,
                    )
                    for match in matches
                ]
            )

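        # Schedule all remote fetches concurrently; each task parses its own response when it completes.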
        remote_url_requests = []
        for remote_url_requests_input in remote_url_requests_inputs:
            get_response_task = asyncio.create_task(
                http_client.send_request(
                    method='GET',
                    url=remote_url_requests_input.requests_from_url,
                )
            )

            get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
            remote_url_requests.append(get_response_task)

        await asyncio.gather(*remote_url_requests)
        return created_requests