@@ -64,17 +64,37 @@ jobs:
64
64
'./**/*.html'
65
65
66
66
- name : Download Ultralytics Website
67
- # WARNING: Do not remove deprecated language directories from --exclude-directories list
68
67
if : matrix.branch == 'main'
69
68
run : |
69
+ # Download sitemap.xml
70
+ wget -O sitemap.xml https://www.ultralytics.com/sitemap.xml
71
+
72
+ # Extract the contents of every <loc> element from sitemap.xml into urls.txt, one URL per line
73
+ tr '\n' ' ' < sitemap.xml | \
74
+ sed 's/<loc>/\n<loc>/g' | \
75
+ grep -oP '(?<=<loc>).*?(?=</loc>)' | \
76
+ sed 's/^[[:space:]]*//;s/[[:space:]]*$//' > urls.txt
77
+
78
+ # Count total URLs to be downloaded
79
+ total_urls=$(wc -l < urls.txt)
80
+ echo "Total URLs to be downloaded: $total_urls"
81
+
82
+ # Download every URL listed in urls.txt (wget fetches them sequentially, pausing briefly between requests)
70
83
mkdir ultralytics_website
71
84
wget -P ultralytics_website \
72
- --recursive \
73
- --no-parent \
74
85
--adjust-extension \
75
86
--reject "*.jpg*,*.jpeg*,*.png*,*.gif*,*.webp*,*.svg*,*.txt" \
76
- --exclude-directories="/zh/,/ko/,/ja/,/ru/,/de/,/fr/,/es/,/pt/,/tr/,/vi/,/ar/,/it/,/nl/,/hi/" \
77
- https://www.ultralytics.com/ || true
87
+ --input-file=urls.txt \
88
+ --no-clobber \
89
+ --no-parent \
90
+ --wait=0.001 \
91
+ --random-wait \
92
+ --tries=3 \
93
+ --no-verbose
94
+
95
+ # Count successfully downloaded files
96
+ downloaded_files=$(find ultralytics_website -type f | wc -l)
97
+ echo "Total pages downloaded: $downloaded_files"
78
98
79
99
- name : Run Broken Link Checks on Ultralytics Website
80
100
if : matrix.branch == 'main'
0 commit comments