Description
The nlp/linker does not cache the downloaded files. This means it redownloads the (large) file each time and won't work offline.
Example:
`MaxRetryError: HTTPSConnectionPool(host='s3-us-west-2.amazonaws.com', port=443): Max retries exceeded with url: /ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectors_sparse.npz (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x6ccd66190>: Failed to resolve 's3-us-west-2.amazonaws.com' ([Errno 8] nodename nor servname provided, or not known)"))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
Cell In[22], line 4
1 nlp = spacy.load("en_core_sci_lg")
2 # nlp.add_pipe("abbreviation_detector")
----> 4 nlp.add_pipe("scispacy_linker",
5 config={"resolve_abbreviations": True,
6 "linker_name": "umls",
7 "max_entities_per_mention": 3 # 3, #6, #4, #5
8 # , "threshold": 0.85 ## default is 0.8, paper mentions 0.99 as thresh
9 })
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/spacy/language.py:821, in Language.add_pipe(self, factory_name, name, before, after, first, last, source, config, raw_config, validate)
817 pipe_component, factory_name = self.create_pipe_from_source(
818 factory_name, source, name=name
819 )
820 else:
--> 821 pipe_component = self.create_pipe(
822 factory_name,
823 name=name,
824 config=config,
825 raw_config=raw_config,
826 validate=validate,
827 )
828 pipe_index = self._get_pipe_index(before, after, first, last)
829 self._pipe_meta[name] = self.get_factory_meta(factory_name)
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/spacy/language.py:709, in Language.create_pipe(self, factory_name, name, config, raw_config, validate)
706 cfg = {factory_name: config}
707 # We're calling the internal _fill here to avoid constructing the
708 # registered functions twice
--> 709 resolved = registry.resolve(cfg, validate=validate)
710 filled = registry.fill({"cfg": cfg[factory_name]}, validate=validate)["cfg"]
711 filled = Config(filled)
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/confection/init.py:759, in registry.resolve(cls, config, schema, overrides, validate)
750 @classmethod
751 def resolve(
752 cls,
(...)
757 validate: bool = True,
758 ) -> Dict[str, Any]:
--> 759 resolved, _ = cls._make(
760 config, schema=schema, overrides=overrides, validate=validate, resolve=True
761 )
762 return resolved
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/confection/init.py:808, in registry._make(cls, config, schema, overrides, resolve, validate)
806 if not is_interpolated:
807 config = Config(orig_config).interpolate()
--> 808 filled, _, resolved = cls._fill(
809 config, schema, validate=validate, overrides=overrides, resolve=resolve
810 )
811 filled = Config(filled, section_order=section_order)
812 # Check that overrides didn't include invalid properties not in config
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/confection/init.py:880, in registry._fill(cls, config, schema, validate, resolve, parent, overrides)
877 getter = cls.get(reg_name, func_name)
878 # We don't want to try/except this and raise our own error
879 # here, because we want the traceback if the function fails.
--> 880 getter_result = getter(*args, **kwargs)
881 else:
882 # We're not resolving and calling the function, so replace
883 # the getter_result with a Promise class
884 getter_result = Promise(
885 registry=reg_name, name=func_name, args=args, kwargs=kwargs
886 )
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/scispacy/linking.py:85, in EntityLinker.init(self, nlp, name, candidate_generator, resolve_abbreviations, k, threshold, no_definition_threshold, filter_for_definitions, max_entities_per_mention, linker_name)
70 def init(
71 self,
72 nlp: Optional[Language] = None,
(...)
81 linker_name: Optional[str] = None,
82 ):
83 Span.set_extension("kb_ents", default=[], force=True)
---> 85 self.candidate_generator = candidate_generator or CandidateGenerator(
86 name=linker_name
87 )
88 self.resolve_abbreviations = resolve_abbreviations
89 self.k = k
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/scispacy/candidate_generation.py:222, in CandidateGenerator.init(self, ann_index, tfidf_vectorizer, ann_concept_aliases_list, kb, verbose, ef_search, name)
218 name = "umls"
220 linker_paths = DEFAULT_PATHS.get(name, UmlsLinkerPaths)
--> 222 self.ann_index = ann_index or load_approximate_nearest_neighbours_index(
223 linker_paths=linker_paths, ef_search=ef_search
224 )
225 self.vectorizer = tfidf_vectorizer or joblib.load(
226 cached_path(linker_paths.tfidf_vectorizer)
227 )
228 self.ann_concept_aliases_list = ann_concept_aliases_list or json.load(
229 open(cached_path(linker_paths.concept_aliases_list))
230 )
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/scispacy/candidate_generation.py:134, in load_approximate_nearest_neighbours_index(linker_paths, ef_search)
117 def load_approximate_nearest_neighbours_index(
118 linker_paths: LinkerPaths,
119 ef_search: int = 200,
120 ) -> FloatIndex:
121 """
122 Load an approximate nearest neighbours index from disk.
123
(...)
131 of magnitude for a small performance hit.
132 """
133 concept_alias_tfidfs = scipy.sparse.load_npz(
--> 134 cached_path(linker_paths.tfidf_vectors)
135 ).astype(numpy.float32)
136 ann_index = nmslib.init(
137 method="hnsw",
138 space="cosinesimil_sparse",
139 data_type=nmslib.DataType.SPARSE_VECTOR,
140 )
141 ann_index.addDataPointBatch(concept_alias_tfidfs)
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/scispacy/file_cache.py:39, in cached_path(url_or_filename, cache_dir)
35 parsed = urlparse(url_or_filename)
37 if parsed.scheme in ("http", "https"):
38 # URL, so get it from the cache (downloading if necessary)
---> 39 return get_from_cache(url_or_filename, cache_dir)
40 elif os.path.exists(url_or_filename):
41 # File, and it exists.
42 return url_or_filename
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/scispacy/file_cache.py:119, in get_from_cache(url, cache_dir)
115 cache_dir = DATASET_CACHE
117 os.makedirs(cache_dir, exist_ok=True)
--> 119 response = requests.head(url, allow_redirects=True)
120 if response.status_code != 200:
121 raise IOError(
122 "HEAD request failed for url {} with status code {}".format(
123 url, response.status_code
124 )
125 )
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/requests/api.py:100, in head(url, **kwargs)
89 r"""Sends a HEAD request.
90
91 :param url: URL for the new :class:Request
object.
(...)
96 :rtype: requests.Response
97 """
99 kwargs.setdefault("allow_redirects", False)
--> 100 return request("head", url, **kwargs)
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/requests/api.py:59, in request(method, url, **kwargs)
55 # By using the 'with' statement we are sure the session is closed, thus we
56 # avoid leaving sockets open which can trigger a ResourceWarning in some
57 # cases, and look like a memory leak in others.
58 with sessions.Session() as session:
---> 59 return session.request(method=method, url=url, **kwargs)
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
584 send_kwargs = {
585 "timeout": timeout,
586 "allow_redirects": allow_redirects,
587 }
588 send_kwargs.update(settings)
--> 589 resp = self.send(prep, **send_kwargs)
591 return resp
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs)
700 start = preferred_clock()
702 # Send the request
--> 703 r = adapter.send(request, **kwargs)
705 # Total elapsed time of the request (approximately)
706 elapsed = preferred_clock() - start
File /opt/anaconda3/envs/Medrag/lib/python3.11/site-packages/requests/adapters.py:700, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
696 if isinstance(e.reason, _SSLError):
697 # This branch is for urllib3 v1.22 and later.
698 raise SSLError(e, request=request)
--> 700 raise ConnectionError(e, request=request)
702 except ClosedPoolError as e:
703 raise ConnectionError(e, request=request)
ConnectionError: HTTPSConnectionPool(host='s3-us-west-2.amazonaws.com', port=443): Max retries exceeded with url: /ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectors_sparse.npz (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x6ccd66190>: Failed to resolve 's3-us-west-2.amazonaws.com' ([Errno 8] nodename nor servname provided, or not known)"))`
This does work if I'm connected to the internet, but nothing seems to be saved, i.e. even rerunning in the same notebook and runtime after turning the internet off results in the error.
I am using the latest package version. Mac M3 env. Python 3.11, conda.
This is a pretty annoying bug since it breaks stuff. The same issue also occurs for me when running in WSL2, so it doesn't seem to be Mac- or admin-rights-specific.