Skip to content

Commit 095e600

Browse files
committed
Initial draft of new standalone download script
1 parent 9bbbad9 commit 095e600

File tree

1 file changed

+46
-0
lines changed

1 file changed

+46
-0
lines changed

src/setup/download.sh

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/usr/bin/env bash
#
# Standalone download script for Ensembl reference data.
#
# Usage: download.sh ENSEMBL SPECIES ASSEMBLY
#
# Fetches the primary-assembly FASTA and the GFF3 annotation for one species
# from ftp.ensembl.org and verifies both against the published CHECKSUMS
# files. For Homo_sapiens and Mus_musculus the regulation *.gff.gz file is
# fetched and verified as well.
#
# External tools used: curl, wget2, sum, md5sum, grep.

# Strict mode: abort on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
4+
5+
#######################################
# Verify a downloaded file against its entry in a `sum`-style checksum list.
# Each line of the list is expected to look like "<checksum> <blocks> <name>".
# Arguments:
#   $1 - name of the downloaded file; must equal the name field of its entry
#   $2 - path to the checksum list
# Outputs:
#   Writes a diagnostic to stderr when verification fails.
# Returns:
#   0 when checksum and block count both match; otherwise the whole script
#   exits with status 1.
#######################################
check_sum() {
  local file=$1 sums=$2
  local want_sum='' want_blocks='' name='' found=0
  local actual have_sum have_blocks

  # Look the entry up by exact name. Using the file name as a grep regex
  # would let "." match any character and could select more than one line.
  # The `|| [[ -n ... ]]` keeps a final line without a trailing newline.
  while read -r want_sum want_blocks name || [[ -n "$want_sum" ]]; do
    if [[ "$name" == "$file" ]]; then
      found=1
      break
    fi
  done < "$sums"

  if (( ! found )); then
    >&2 echo "No checksum entry for $1 in $2"
    exit 1
  fi

  # Feed the file on stdin: some `sum` implementations append the file name
  # to the output when given a file argument, which would break comparison.
  actual=$(sum < "$file") || {
    >&2 echo "Could not compute checksum for $1"
    exit 1
  }
  # Split into fields so differences in column padding do not matter.
  read -r have_sum have_blocks _ <<< "$actual"

  if [[ "$have_sum" != "$want_sum" || "$have_blocks" != "$want_blocks" ]]; then
    >&2 echo "Checksum mismatch for $1"
    exit 1
  fi
}
15+
16+
# Positional arguments (all required):
#   $1 - Ensembl release number; used in the release-N URL path and in the
#        GFF3 file name
#   $2 - species name in file-name case (e.g. Homo_sapiens); the lower-cased
#        form is used for directory names
#   $3 - assembly name as it appears in the file names
usage="usage: ${0##*/} ENSEMBL SPECIES ASSEMBLY"
ENSEMBL="${1:?$usage}"
SPECIES="${2:?$usage}"
ASSEMBLY="${3:?$usage}"

# URLs are case-sensitive. Directories are lowercase, files are mixed.
species_lc="${SPECIES,,}"
base_url="https://ftp.ensembl.org/pub/release-${ENSEMBL}"
ca_bundle="ensembl.org.chained.crt"

# NOTE: saccharomyces_cerevisiae only has toplevel, so the primary_assembly
# file name below does not exist for it.
fasta_file="${SPECIES}.${ASSEMBLY}.dna.primary_assembly.fa.gz"
gff3_file="${SPECIES}.${ASSEMBLY}.${ENSEMBL}.gff3.gz"

# Ensembl has configured its HTTPS server incorrectly, missing an intermediate
# certificate in the chain. This is a workaround that still verifies the
# certificate chain, but it would still permit an attack from
# whatsmychaincert.com or somebody compromising that website.
# --fail stops an HTTP error page from being saved as the CA bundle.
curl --fail --output "$ca_bundle" \
  'https://whatsmychaincert.com/generate?include_leaf=1;host=ftp.ensembl.org'

# One curl invocation fetches both CHECKSUMS lists and both data files.
curl --cacert "$ca_bundle" --fail-early --fail \
  --output fasta.sum "${base_url}/fasta/${species_lc}/dna/CHECKSUMS" \
  --remote-name "${base_url}/fasta/${species_lc}/dna/${fasta_file}" \
  --output gff3.sum "${base_url}/gff3/${species_lc}/CHECKSUMS" \
  --remote-name "${base_url}/gff3/${species_lc}/${gff3_file}"
check_sum "$fasta_file" fasta.sum
check_sum "$gff3_file" gff3.sum

# Could check if the regulation directory exists (not 404) but that's another
# network request.
if [[ "$SPECIES" == Homo_sapiens || "$SPECIES" == Mus_musculus ]]; then
  wget2 --ca-certificate "$ca_bundle" \
    --execute robots=off --recursive --level=1 --no-parent --no-directories \
    --accept "CHECKSUMS" --accept "*.gff.gz" \
    "${base_url}/regulation/${species_lc}/"

  # The file name contains a component not known in advance, so resolve it
  # with a glob. Expanding into an array makes zero or multiple matches
  # detectable instead of silently passing a pattern or several names on.
  matches=(./"${species_lc}.${ASSEMBLY}".Regulatory_Build.regulatory_features.*.gff.gz)
  if (( ${#matches[@]} != 1 )) || [[ ! -f "${matches[0]}" ]]; then
    >&2 echo "Expected exactly one regulatory features file, found: ${matches[*]}"
    exit 1
  fi
  DOWNLOAD="${matches[0]}"

  # The whole md5sum output line (hash plus ./-prefixed path) is compared to
  # the CHECKSUMS line naming the same path.
  # NOTE(review): this presumes CHECKSUMS lists the file with the same "./"
  # prefix and spacing that md5sum prints - confirm against a real listing.
  actual_md5=$(md5sum -- "$DOWNLOAD")
  expected_md5=$(grep -F -- "$DOWNLOAD" CHECKSUMS) || expected_md5=""
  if [[ "$actual_md5" != "$expected_md5" ]]; then
    >&2 echo "Checksum mismatch for $DOWNLOAD"
    exit 1
  fi
fi

0 commit comments

Comments
 (0)