Improve Ultralytics website download (#96)

glenn-jocher · UltralyticsAssistant · web-flow · commit 7759e9d94efb · 2024-08-26T22:35:49.000+02:00
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml
@@ -64,17 +64,37 @@ jobs:
             './**/*.html'
 
       - name: Download Ultralytics Website
-        # WARNING: Do not remove deprecated language directories from --exclude-directories list
         if: matrix.branch == 'main'
         run: |
+          # Fetch the site's sitemap.xml into the working directory
+          wget -O sitemap.xml https://www.ultralytics.com/sitemap.xml
+          
+          # Extract every <loc> value: flatten the XML to one line, start a new line at each <loc>, keep the text between the tags, trim whitespace
+          tr '\n' ' ' < sitemap.xml | \
+            sed 's/<loc>/\n<loc>/g' | \
+            grep -oP '(?<=<loc>).*?(?=</loc>)' | \
+            sed 's/^[[:space:]]*//;s/[[:space:]]*$//' > urls.txt
+          
+          # Count the extracted URLs (one per line in urls.txt) and log the total
+          total_urls=$(wc -l < urls.txt)
+          echo "Total URLs to be downloaded: $total_urls"
+          
+          # Download every URL listed in urls.txt (wget reads --input-file entries one after another, not in parallel)
           mkdir ultralytics_website
           wget -P ultralytics_website \
-               --recursive \
-               --no-parent \
                --adjust-extension \
                --reject "*.jpg*,*.jpeg*,*.png*,*.gif*,*.webp*,*.svg*,*.txt" \
-               --exclude-directories="/zh/,/ko/,/ja/,/ru/,/de/,/fr/,/es/,/pt/,/tr/,/vi/,/ar/,/it/,/nl/,/hi/" \
-               https://www.ultralytics.com/ || true
+               --input-file=urls.txt \
+               --no-clobber \
+               --no-parent \
+               --wait=0.001 \
+               --random-wait \
+               --tries=3 \
+               --no-verbose
+          
+          # Count the files wget saved under ultralytics_website and log the total
+          downloaded_files=$(find ultralytics_website -type f | wc -l)
+          echo "Total pages downloaded: $downloaded_files"
 
       - name: Run Broken Link Checks on Ultralytics Website
         if: matrix.branch == 'main'