Commit c14fb9a

Pijukatel and janbuchar authored
feat: Handle request list user input (#326)
Add helper function to handle request list inputs. Closes: #310

Co-authored-by: Jan Buchar <[email protected]>
1 parent 777637a · commit c14fb9a

12 files changed, +387 -10 lines changed

pyproject.toml

Lines changed: 3 additions & 0 deletions
@@ -141,6 +141,9 @@ indent-style = "space"
 docstring-quotes = "double"
 inline-quotes = "single"
 
+[tool.ruff.lint.flake8-type-checking]
+runtime-evaluated-base-classes = ["pydantic.BaseModel", "crawlee.configuration.Configuration"]
+
 [tool.ruff.lint.flake8-builtins]
 builtins-ignorelist = ["id"]

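The new `[tool.ruff.lint.flake8-type-checking]` table tells ruff which base classes make a subclass "runtime-evaluated": for classes deriving from `pydantic.BaseModel` or `crawlee.configuration.Configuration`, imports used only in annotations must stay at module level rather than being moved into an `if TYPE_CHECKING:` block, because pydantic resolves those annotations when the model class is built. A minimal sketch of the kind of module this setting protects (the `ExampleWebhook` model is illustrative, not taken from this commit):

```python
from __future__ import annotations

# This import is only used in an annotation, but it must remain a runtime import:
# pydantic evaluates `created_at: datetime` when building the model class.
from datetime import datetime

from pydantic import BaseModel


class ExampleWebhook(BaseModel):  # runtime-evaluated base class per the ruff setting above
    created_at: datetime
```

With this setting in place, ruff's TCH rules no longer flag such imports, which is what allows the per-file `# ruff: noqa: TCH00x` suppressions below to be removed.
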

src/apify/_actor.py

Lines changed: 2 additions & 1 deletion
@@ -8,7 +8,6 @@
 
 from lazy_object_proxy import Proxy
 from pydantic import AliasChoices
-from typing_extensions import Self
 
 from apify_client import ApifyClientAsync
 from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
@@ -31,6 +30,8 @@
     import logging
     from types import TracebackType
 
+    from typing_extensions import Self
+
     from crawlee.proxy_configuration import _NewUrlFunction
 
     from apify._models import Webhook
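
Here, and in the Scrapy integration modules below, the commit replaces per-line `# noqa: TCH00x` suppressions with the pattern those rules actually ask for: imports used only in annotations move into an `if TYPE_CHECKING:` block, so they are seen by type checkers but never imported at runtime. A minimal sketch of the pattern (the `clone` method is illustrative):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by static type checkers; never imported at runtime.
    from typing_extensions import Self


class ExampleClient:
    def clone(self) -> Self:
        # `from __future__ import annotations` turns the annotation into a string,
        # so the missing runtime import of `Self` is not a problem here.
        return self
```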

src/apify/_configuration.py

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (so that pydantic annotations work)
 from __future__ import annotations
 
 from datetime import datetime, timedelta

src/apify/_models.py

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (Pydantic)
 from __future__ import annotations
 
 from datetime import datetime, timedelta

src/apify/_platform_event_manager.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import asyncio
-from datetime import datetime  # noqa: TCH003
+from datetime import datetime
 from typing import TYPE_CHECKING, Annotated, Any, Literal, Union
 
 import websockets.client

src/apify/scrapy/middlewares/apify_proxy.py

Lines changed: 4 additions & 2 deletions
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse
 
 try:
-    from scrapy import Request, Spider  # noqa: TCH002
+    if TYPE_CHECKING:
+        from scrapy import Request, Spider
+        from scrapy.crawler import Crawler
     from scrapy.core.downloader.handlers.http11 import TunnelError
-    from scrapy.crawler import Crawler  # noqa: TCH002
     from scrapy.exceptions import NotConfigured
 except ImportError as exc:
     raise ImportError(

src/apify/scrapy/pipelines/actor_dataset_push.py

Lines changed: 4 additions & 1 deletion
@@ -1,9 +1,12 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from itemadapter.adapter import ItemAdapter
 
 try:
-    from scrapy import Item, Spider  # noqa: TCH002
+    if TYPE_CHECKING:
+        from scrapy import Item, Spider
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',

src/apify/scrapy/scheduler.py

Lines changed: 4 additions & 1 deletion
@@ -1,15 +1,18 @@
 from __future__ import annotations
 
 import traceback
+from typing import TYPE_CHECKING
 
 from apify._configuration import Configuration
 from apify.apify_storage_client import ApifyStorageClient
 
 try:
     from scrapy import Spider
     from scrapy.core.scheduler import BaseScheduler
-    from scrapy.http.request import Request  # noqa: TCH002
     from scrapy.utils.reactor import is_asyncio_reactor_installed
+
+    if TYPE_CHECKING:
+        from scrapy.http.request import Request
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',

src/apify/scrapy/utils.py

Lines changed: 4 additions & 1 deletion
@@ -2,14 +2,17 @@
 
 import asyncio
 from base64 import b64encode
+from typing import TYPE_CHECKING
 from urllib.parse import unquote
 
 from apify_shared.utils import ignore_docs
 
 try:
-    from scrapy.settings import Settings  # noqa: TCH002
     from scrapy.utils.project import get_project_settings
     from scrapy.utils.python import to_bytes
+
+    if TYPE_CHECKING:
+        from scrapy.settings import Settings
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run '

src/apify/storages/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -1,3 +1,5 @@
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue
 
-__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue']
+from ._request_list import RequestList
+
+__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue', 'RequestList']
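
With this re-export, the new class is importable from the same public module as the other storages, e.g.:

```python
from apify.storages import Dataset, KeyValueStore, RequestList, RequestQueue
```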

src/apify/storages/_request_list.py

Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
+from __future__ import annotations
+
+import asyncio
+import re
+from asyncio import Task
+from functools import partial
+from typing import Annotated, Any, Union
+
+from pydantic import BaseModel, Field, TypeAdapter
+
+from crawlee import Request
+from crawlee._types import HttpMethod
+from crawlee.http_clients import BaseHttpClient, HttpxHttpClient
+from crawlee.storages import RequestList as CrawleeRequestList
+
+from apify._utils import docs_group
+
+URL_NO_COMMAS_REGEX = re.compile(
+    r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
+)
+
+
+class _RequestDetails(BaseModel):
+    method: HttpMethod = 'GET'
+    payload: str = ''
+    headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
+    user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')] = {}
+
+
+class _RequestsFromUrlInput(_RequestDetails):
+    requests_from_url: str = Field(alias='requestsFromUrl')
+
+
+class _SimpleUrlInput(_RequestDetails):
+    url: str
+
+
+url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])
+
+
+@docs_group('Classes')
+class RequestList(CrawleeRequestList):
+    """Extends crawlee RequestList.
+
+    Method open is used to create RequestList from actor's requestListSources input.
+    """
+
+    @staticmethod
+    async def open(
+        name: str | None = None,
+        request_list_sources_input: list[dict[str, Any]] | None = None,
+        http_client: BaseHttpClient | None = None,
+    ) -> RequestList:
+        """Creates RequestList from Actor input requestListSources.
+
+        Args:
+            name: Name of the returned RequestList.
+            request_list_sources_input: List of dicts with either url key or requestsFromUrl key.
+            http_client: Client that will be used to send GET requests to the urls defined by requestsFromUrl values.
+
+        Returns:
+            RequestList created from request_list_sources_input.
+
+        ### Usage
+
+        ```python
+        example_input = [
+            # Gather urls from response body.
+            {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
+            # Directly include this url.
+            {'url': 'https://crawlee.dev', 'method': 'GET'}
+        ]
+        request_list = await RequestList.open(request_list_sources_input=example_input)
+        ```
+        """
+        request_list_sources_input = request_list_sources_input or []
+        return await RequestList._create_request_list(name, request_list_sources_input, http_client)
+
+    @staticmethod
+    async def _create_request_list(
+        name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: BaseHttpClient | None
+    ) -> RequestList:
+        if not http_client:
+            http_client = HttpxHttpClient()
+
+        url_inputs = url_input_adapter.validate_python(request_list_sources_input)
+
+        simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
+        remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]
+
+        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
+        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)
+
+        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)
+
+    @staticmethod
+    def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
+        return [
+            Request.from_url(
+                method=request_input.method,
+                url=request_input.url,
+                payload=request_input.payload.encode('utf-8'),
+                headers=request_input.headers,
+                user_data=request_input.user_data,
+            )
+            for request_input in simple_url_inputs
+        ]
+
+    @staticmethod
+    async def _fetch_requests_from_url(
+        remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient
+    ) -> list[Request]:
+        """Create a list of requests from a remote url.
+
+        Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Run an extracting
+        callback on each response body and use the URL_NO_COMMAS_REGEX regex to find all links. Create a list of
+        Requests from the collected links and the additional inputs stored in each remote_url_requests_inputs.
+        """
+        created_requests: list[Request] = []
+
+        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
+            """Callback to scrape response body with regexp and create Requests from matches."""
+            matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
+            created_requests.extend(
+                [
+                    Request.from_url(
+                        match.group(0),
+                        method=request_input.method,
+                        payload=request_input.payload.encode('utf-8'),
+                        headers=request_input.headers,
+                        user_data=request_input.user_data,
+                    )
+                    for match in matches
+                ]
+            )
+
+        remote_url_requests = []
+        for remote_url_requests_input in remote_url_requests_inputs:
+            get_response_task = asyncio.create_task(
+                http_client.send_request(
+                    method='GET',
+                    url=remote_url_requests_input.requests_from_url,
+                )
+            )
+
+            get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
+            remote_url_requests.append(get_response_task)
+
+        await asyncio.gather(*remote_url_requests)
+        return created_requests
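
Taken together, the typical call site is an Actor that reads the `requestListSources` field from its input and opens a `RequestList` from it. A hedged sketch of such a run (the input field name follows the docstring above; the draining loop assumes the request-provider interface of the underlying crawlee `RequestList`, i.e. `fetch_next_request` and `mark_request_as_handled`):

```python
from apify import Actor
from apify.storages import RequestList


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}

        # `requestListSources` may mix plain `url` entries with `requestsFromUrl` entries;
        # the helper validates both shapes and downloads the remote lists.
        request_list = await RequestList.open(
            name='start-urls',
            request_list_sources_input=actor_input.get('requestListSources', []),
        )

        # Illustrative only: drain the list and log what it produced.
        while (request := await request_list.fetch_next_request()) is not None:
            Actor.log.info(f'Loaded request: {request.url}')
            await request_list.mark_request_as_handled(request)
```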
