ai-forever
diff --git a/DPF/connectors/s3_connector.py b/DPF/connectors/s3_connector.py
Lines changed: 5 additions & 22 deletions
diff --git a/DPF/dataset_reader.py b/DPF/dataset_reader.py
Lines changed: 1 addition & 12 deletions
diff --git a/DPF/filters/complex_filter.py b/DPF/filters/complex_filter.py
Lines changed: 3 additions & 10 deletions
diff --git a/DPF/filters/images/complexity_filter.py b/DPF/filters/images/complexity_filter.py
Lines changed: 7 additions & 7 deletions
diff --git a/DPF/filters/multigpu_filter.py b/DPF/filters/multigpu_filter.py
Lines changed: 4 additions & 27 deletions
diff --git a/DPF/filters/videos/cogvlm2_filter.py b/DPF/filters/videos/cogvlm2_filter.py
Lines changed: 21 additions & 20 deletions
@@ -37,29 +37,12 @@ def _preprocess_filepath(path: str) -> str:
 
     def read_file(self, filepath: str, binary: bool) -> io.BytesIO:
         mode = "rb" if binary else "rt"
-        if '.tar' in filepath and '?tar_offset=' in filepath and '?size=' in filepath:
-            filepath = self._preprocess_filepath(filepath)
-            offset = filepath.split('?tar_offset=')[1].split('?size=')[0]
-            size = filepath.split('?size=')[1]
-            filepath = filepath.split('?')[0]
-            offset = int(offset)
-            size = int(size)
-            s3 = self.s3client._get_client()
-            range_header = "bytes=%d-%d" % (offset, offset + size - 1)
-            bucket_name = filepath.split('/')[0]
-            tar_key = filepath.replace(bucket_name, '')[1:]
-            video_obj = s3.get_object(Bucket=bucket_name, Key=tar_key, Range=range_header)
-            res = video_obj["Body"].read()
+        with self.s3client.open(self._preprocess_filepath(filepath), mode=mode) as f:
             if mode == "rb":
-                res = io.BytesIO(res)
+                res = io.BytesIO(f.read())
                 res.seek(0)
-        else:
-            with self.s3client.open(self._preprocess_filepath(filepath), mode=mode) as f:
-                if mode == "rb":
-                    res = io.BytesIO(f.read())
-                    res.seek(0)
-                else:
-                    res = f.read()
+            else:
+                res = f.read()
         return res
 
     def save_file(
@@ -95,4 +78,4 @@ def join(self, *args: str) -> str:
                 path += arg
             else:
                 path += arg+'/'
-        return path[:-1]
+        return path[:-1]
@@ -2,7 +2,6 @@
 from typing import Optional, Union
 
 import pandas as pd
-import numpy as np
 from tqdm.contrib.concurrent import process_map
 
 from DPF.configs import (
@@ -43,7 +42,6 @@ def __init__(self, connector: Optional[Connector] = None):
         if connector is None:
             connector = LocalConnector()
         self.connector = connector
-        self.local_connector = LocalConnector()
 
     def _read_and_validate_dataframes(
         self,
@@ -272,10 +270,7 @@ def read_files(
             Instance of FilesDatasetProcessor dataset
         """
         table_path = config.table_path.rstrip("/")
-        try:
-            df = self.connector.read_dataframe(table_path)
-        except:
-            df = self.local_connector.read_dataframe(table_path)
+        df = self.connector.read_dataframe(table_path)
 
         required_columns = list(config.user_column2default_column.keys())
         column_set = set(df.columns.tolist())
@@ -293,12 +288,6 @@ def read_files(
                 path_col = datatype.modality.path_column
                 df[path_col] = df[path_col].apply(lambda x: self.connector.join(config.base_path, x))
 
-                # process .tar files with offsets
-                for i, row in df.iterrows():
-                    if isinstance(df.at[i,'tar_offset'], np.int64) and isinstance(df.at[i,'size'], np.int64):
-                        df.at[i, path_col] += f'?tar_offset={df.at[i,"tar_offset"]}?size={df.at[i,"size"]}'
-
-
         return FilesDatasetProcessor(
             connector=self.connector,
             df=df,
 
@@ -16,21 +16,14 @@ class ComplexDataFilter(DataFilter):
 
     def __init__(
         self,
-        datafilters,
-        kwargs,
+        datafilters: list[DataFilter],
         workers: int,
         pbar: bool = True,
-        _pbar_position: int = 0,
-        device = 'cuda:0'
+        _pbar_position: int = 0
     ):
         super().__init__(pbar, _pbar_position)
-        self.datafilters = []
+        self.datafilters = datafilters
         self.workers = workers
-        self.device = device
-
-        for filter, kwarg in zip(datafilters, kwargs):
-            kwarg['device'] = self.device
-            self.datafilters.append(filter(**kwarg))
 
         assert len(self.datafilters) > 0
         assert all(
 
@@ -1,15 +1,15 @@
 import os
 from typing import Any
 from urllib.request import urlretrieve
+
 import numpy as np
 import torch
+from segment_anything import SamAutomaticMaskGenerator, sam_model_registry
 
-from ...types import ModalityToDataMapping
 from DPF.utils import read_image_rgb_from_bytes
-from .img_filter import ImageFilter
-
-from segment_anything import SamAutomaticMaskGenerator, sam_model_registry
 
+from ...types import ModalityToDataMapping
+from .img_filter import ImageFilter
 
 WEIGHTS_URL = {'vit_h': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth',
                'vit_l': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth',
@@ -57,7 +57,7 @@ def __init__(
         self.model_name = model_name
         self.weights_folder = weights_folder
         self.points_per_side = points_per_side
-        
+
         # Download checkpoints
         path_to_model = os.path.join(self.weights_folder, self.model_name + '.pth')
         if not os.path.exists(path_to_model):
@@ -67,7 +67,7 @@ def __init__(
         sam = sam_model_registry[self.model_name](checkpoint=path_to_model)
         sam = sam.to(torch.device(self.device))
         self.mask_generator = SamAutomaticMaskGenerator(
-                                sam, points_per_batch=batch_size, 
+                                sam, points_per_batch=batch_size,
                                 points_per_side=points_per_side
                                 )
 
@@ -111,7 +111,7 @@ def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
                 mean_area = np.mean(areas) / hw
             else:
                 max_area = mean_area = 0
-                    
+
             df_batch_labels["complexity_num_segments"].extend([num_segments])
             df_batch_labels["complexity_max_segment_area"].extend([max_area])
             df_batch_labels["complexity_mean_segment_area"].extend([mean_area])
 
@@ -1,6 +1,6 @@
 import multiprocessing
 from multiprocessing import Manager
-from typing import Any, Union, Optional, Callable
+from typing import Any, Callable, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -60,9 +60,9 @@ def __init__(
         ----------
         devices: list[Union[torch.device, str]]
             List of devices to run datafilter on
-        datafilter_class: type[DataFilter]
+        datafilter_class: Optional[type[DataFilter]] = None
             Class of datafilter to use
-        datafilter_params: dict[str, Any]
+        datafilter_params: Optional[dict[str, Any]] = None
             Parameters for datafilter_class initialization
         datafilter_init_fn: Optional[Callable[[int, Union[str, torch.device], dict[str, Any]], DataFilter]] = None
             Initialization function for a datafilter. Takes _pbar_position as first arg and device as a second arg
@@ -77,11 +77,6 @@ def __init__(
         self.devices = devices
         self.num_parts = len(devices)
 
-        self.filters = []
-        for i in range(self.num_parts):
-            self.filters.append(datafilter_class(**datafilter_params, _pbar_position=i, device=devices[i]))
-            self.filters[i]._created_by_multigpu_data_filter = True
-
         # getting result columns names
         if self.datafilter_init_fn:
             datafilter = self.datafilter_init_fn(0, devices[0], self.datafilter_init_fn_kwargs)
@@ -146,7 +141,7 @@ def run(
         processes = []
         context = multiprocessing.get_context('spawn')
         for param in params:
-            p = context.Process(target=self.run_one_process, args=param)
+            p = context.Process(target=run_one_process, args=param)
             p.start()
             processes.append(p)
 
@@ -156,21 +151,3 @@ def run(
         res_df = pd.concat(shared_results)
         res_df.sort_index(inplace=True)
         return res_df
-    
-    
-    def run_one_process(
-        self,
-        config: DatasetConfig,
-        connector: Connector,
-        df: pd.DataFrame,
-        i: int,
-        index: pd.Series,
-        results: list[pd.DataFrame],
-        filter_run_kwargs: dict[str, Any]
-    ) -> None:
-        reader = DatasetReader(connector=connector)
-        processor = reader.from_df(config, df)
-        processor.apply_data_filter(self.filters[i], **filter_run_kwargs)
-        res = processor.df
-        res.set_index(index, inplace=True)
-        results.append(res)
@@ -1,15 +1,15 @@
+import re
 from io import BytesIO
 from typing import Any
 
-from DPF.types import ModalityToDataMapping
-
-from .video_filter import VideoFilter
 import numpy as np
 import torch
 from decord import VideoReader, bridge
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-import re
 
+from DPF.types import ModalityToDataMapping
+
+from .video_filter import VideoFilter
 
 prompt_templates = {
     'detailed_video': 'Describe this video and its style in a very detailed manner',
@@ -33,25 +33,25 @@
 ]
 
 
-def clean_with_regex(caption):
-    lower_caption = str(caption).lower().strip() 
-    for re_compiled, replacement in compiled_regexs: 
-        iterator = reversed(list(re_compiled.finditer(lower_caption))) 
-        for match in iterator: 
-            pos = list(match.span()) 
+def clean_with_regex(caption: str) -> str:
+    lower_caption = str(caption).lower().strip()
+    for re_compiled, replacement in compiled_regexs:
+        iterator = reversed(list(re_compiled.finditer(lower_caption)))
+        for match in iterator:
+            pos = list(match.span())
             caption = caption[:pos[0]] + replacement + caption[pos[1]:]
             lower_caption = str(caption).lower().strip()
-            
+
     if caption.count('-') > 2:
         split_captions = []
         for split_caption in caption.split():
             if split_caption.count('-') > 2:
                 split_caption = re.sub(r'-', ' ', split_caption)
             split_captions.append(split_caption)
         caption = ' '.join(split_captions)
-        
+
     caption = caption.strip('—-:/+=|@#&*')
-        
+
     return caption.strip()
 
 
@@ -156,8 +156,8 @@ def preprocess_data(
     ) -> Any:
         key = metadata[self.key_column]
         video_file = BytesIO(modality2data['video'])
-        video_file = self.load_video(video_file, strategy=self.strategy)
-        return key, video_file
+        loaded_video_file = self.load_video(video_file, strategy=self.strategy)
+        return key, loaded_video_file
 
     def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
         df_batch_labels = self._get_dict_from_schema()
@@ -196,26 +196,27 @@ def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
         return df_batch_labels
 
 
-    def load_video(self, video_path, strategy='chat'):
+    def load_video(self, video_path: BytesIO, strategy: str = 'chat') -> torch.Tensor:
         bridge.set_bridge('torch')
         num_frames = self.num_frames
 
         decord_vr = VideoReader(uri=video_path)
-        frame_id_list = None
         total_frames = len(decord_vr)
         if strategy == 'base':
             frame_id_list = np.linspace(0, total_frames - 1, num_frames, dtype=int)
         elif strategy == 'chat':
             timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames))
             timestamps = [i[0] for i in timestamps]
             max_second = round(max(timestamps)) + 1
-            frame_id_list = []
+            frame_id_list = []  # type: ignore
             for second in range(max_second):
                 closest_num = min(timestamps, key=lambda x: abs(x - second))
                 index = timestamps.index(closest_num)
-                frame_id_list.append(index)
+                frame_id_list.append(index)  # type: ignore
                 if len(frame_id_list) >= num_frames:
                     break
-        video_data = decord_vr.get_batch(frame_id_list)
+        else:
+            frame_id_list = None
+        video_data: torch.Tensor = decord_vr.get_batch(frame_id_list)
         video_data = video_data.permute(3, 0, 1, 2)
         return video_data