|
| 1 | +#!/usr/bin/env bash |
| 2 | + |
| 3 | +set -euo pipefail |
| 4 | + |
#######################################
# Verify a downloaded file against a CHECKSUMS listing produced by `sum`.
#
# Each line of the listing is expected to look like
# "<checksum> <blocks> <file name>"; the amount of whitespace between the
# columns does not matter because the fields are compared individually.
#
# Arguments:
#   $1 - name of the file to verify, exactly as it appears in the listing
#   $2 - path to the checksum listing
# Outputs:
#   Writes a diagnostic to stderr when verification is not possible or fails.
# Returns:
#   0 when checksum and block count match; otherwise exits the shell with 1.
#######################################
check_sum() {
  local file=$1
  local sums=$2
  local entry_sum entry_blocks entry_name
  local want_sum='' want_blocks=''
  local actual got_sum got_blocks

  # Match the file name literally against the last column. A regex lookup
  # would let the dots in the name match any character and could pick up
  # more than one line.
  while read -r entry_sum entry_blocks entry_name || [[ -n "$entry_sum" ]]; do
    if [[ "$entry_name" == "$file" ]]; then
      want_sum=$entry_sum
      want_blocks=$entry_blocks
      break
    fi
  done <"$sums"

  if [[ -z "$want_sum" ]]; then
    >&2 echo "No checksum entry for $file in $sums"
    exit 1
  fi

  # Reading from stdin keeps `sum` from ever appending a file name, so the
  # output is always just "<checksum> <blocks>".
  if ! actual=$(sum <"$file"); then
    >&2 echo "Could not compute checksum for $file"
    exit 1
  fi
  read -r got_sum got_blocks _ <<<"$actual"

  if [[ "$got_sum" != "$want_sum" || "$got_blocks" != "$want_blocks" ]]; then
    >&2 echo "Checksum mismatch for $1"
    exit 1
  fi
}
| 15 | + |
# Positional arguments:
#   $1 - Ensembl release number (used in the release-N directory and in the GFF3 file name)
#   $2 - species name with the capitalisation used in Ensembl file names
#   $3 - assembly name as used in Ensembl file names
if (( $# < 3 )); then
  >&2 echo "Usage: ${0##*/} ENSEMBL_RELEASE SPECIES ASSEMBLY"
  exit 1
fi

ENSEMBL="$1"
SPECIES="$2"
ASSEMBLY="$3"

# saccharomyces_cerevisiae only has toplevel
# NOTE(review): presumably this means there is no primary_assembly FASTA for
# that species, so the download below would fail for it - confirm.

# URLs are case-sensitive. Directories are lowercase, files are mixed.
base_url="https://ftp.ensembl.org/pub/release-${ENSEMBL}"
species_dir="${SPECIES,,}"
fasta_file="${SPECIES}.${ASSEMBLY}.dna.primary_assembly.fa.gz"
gff3_file="${SPECIES}.${ASSEMBLY}.${ENSEMBL}.gff3.gz"

# Ensembl has configured its HTTPS server incorrectly, missing an intermediate certificate in the chain.
# This is a workaround that still verifies the certificate chain, but it would still permit an attack
# from whatsmychaincert.com or somebody compromising that website.
# --fail keeps an HTTP error page from being saved as the certificate bundle.
curl --fail --output ensembl.org.chained.crt \
  'https://whatsmychaincert.com/generate?include_leaf=1;host=ftp.ensembl.org'

# Each --output / --remote-name applies to the URL that follows it: the two
# CHECKSUMS files are saved under distinct names, the data files keep theirs.
curl --cacert ensembl.org.chained.crt --fail-early --fail \
  --output fasta.sum "${base_url}/fasta/${species_dir}/dna/CHECKSUMS" \
  --remote-name "${base_url}/fasta/${species_dir}/dna/${fasta_file}" \
  --output gff3.sum "${base_url}/gff3/${species_dir}/CHECKSUMS" \
  --remote-name "${base_url}/gff3/${species_dir}/${gff3_file}"
check_sum "$fasta_file" fasta.sum
check_sum "$gff3_file" gff3.sum
| 34 | + |
# Regulatory build annotation is only fetched for human and mouse.
# Could check if the regulation directory exists (not 404) but that's another network request
if [[ "$SPECIES" == Homo_sapiens || "$SPECIES" == Mus_musculus ]]; then
  # Fetch CHECKSUMS and every *.gff.gz from the species' regulation directory
  # into the current directory (one level deep, no directory tree recreated).
  wget2 --ca-certificate ensembl.org.chained.crt \
    --execute robots=off --recursive --level=1 --no-parent --no-directories \
    --accept "CHECKSUMS" --accept "*.gff.gz" \
    "https://ftp.ensembl.org/pub/release-${ENSEMBL}/regulation/${SPECIES,,}/"

  # Part of the file name is not known in advance, so resolve it with a glob.
  # An array is needed: a glob in a plain assignment is not expanded.
  candidates=(./"${SPECIES,,}.${ASSEMBLY}".Regulatory_Build.regulatory_features.*.gff.gz)
  if (( ${#candidates[@]} != 1 )) || [[ ! -f "${candidates[0]}" ]]; then
    >&2 echo "Expected exactly one regulatory build file, found: ${candidates[*]}"
    exit 1
  fi
  DOWNLOAD=${candidates[0]}

  # Check each step explicitly: a failure inside $(...) within [[ ]] would
  # not abort the script, and two empty strings would compare as equal.
  if ! actual=$(md5sum "$DOWNLOAD"); then
    >&2 echo "Could not compute checksum for $DOWNLOAD"
    exit 1
  fi
  # The whole md5sum line (hash plus the ./-prefixed name) is compared with
  # the matching CHECKSUMS line, as the original comparison did.
  if ! expected=$(grep -F -- "$DOWNLOAD" CHECKSUMS); then
    >&2 echo "No checksum entry for $DOWNLOAD in CHECKSUMS"
    exit 1
  fi
  if [[ "$actual" != "$expected" ]]; then
    >&2 echo "Checksum mismatch for $DOWNLOAD"
    exit 1
  fi
fi
0 commit comments