Skip to content

Commit d34a1a5

Browse files
committed
feat: add video hash filter
1 parent 5d14d57 commit d34a1a5

File tree

1 file changed

+64
-0
lines changed

1 file changed

+64
-0
lines changed
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import io
2+
from typing import Any
3+
import shutil
4+
from videohash import VideoHash
5+
from uuid import uuid4
6+
import os
7+
8+
from DPF.types import ModalityToDataMapping
9+
10+
from .video_filter import VideoFilter
11+
12+
13+
14+
class VideohashFilter(VideoFilter):
    """Video filter that computes a hash for every video via ``videohash``.

    Each sample's raw video bytes are written to a scratch directory,
    ``VideoHash`` is run on that file, and the resulting hex digest is
    stored in the ``video_hash`` result column.

    Parameters
    ----------
    workers: int
        Number of dataloader workers (passed as ``num_workers``).
    pbar: bool
        Forwarded to ``VideoFilter.__init__``.
    _pbar_position: int
        Forwarded to ``VideoFilter.__init__``.
    """

    def __init__(
        self,
        workers: int = 16,
        pbar: bool = True,
        _pbar_position: int = 0
    ) -> None:
        super().__init__(pbar, _pbar_position)
        self.num_workers = workers

    @property
    def result_columns(self) -> list[str]:
        """Names of the columns this filter produces."""
        return ["video_hash"]

    @property
    def dataloader_kwargs(self) -> dict[str, Any]:
        """Keyword arguments for the dataloader; samples are processed one per batch."""
        return {
            "num_workers": self.num_workers,
            "batch_size": 1,
            "drop_last": False,
        }

    def preprocess_data(
        self,
        modality2data: ModalityToDataMapping,
        metadata: dict[str, Any]
    ) -> Any:
        """Compute the video hash for a single sample.

        Parameters
        ----------
        modality2data: ModalityToDataMapping
            Mapping whose ``'video'`` entry holds the raw video bytes.
        metadata: dict[str, Any]
            Sample metadata; must contain ``self.key_column``.

        Returns
        -------
        tuple
            ``(key, hash_hex)`` where ``hash_hex`` is ``VideoHash.hash_hex``.
        """
        key = metadata[self.key_column]
        video_file = modality2data['video']

        # A uuid-named scratch directory under the current working directory
        # keeps concurrent workers from colliding with each other.
        uid = str(uuid4())
        tmp_dir = os.path.join(os.path.abspath(os.getcwd()), uid) + os.path.sep
        os.makedirs(tmp_dir, exist_ok=True)

        try:
            video_path = os.path.join(tmp_dir, 'video.mp4')
            with open(video_path, 'wb') as f:
                f.write(video_file)

            hash_obj = VideoHash(path=video_path, storage_path=tmp_dir)
            # Read the digest before the scratch files are removed.
            hash_hex = hash_obj.hash_hex
        finally:
            # Always clean up, even when writing or hashing fails; otherwise
            # every bad video would leave a stray directory behind.
            # NOTE(review): assumes VideoHash keeps its working files under
            # the storage_path it was given, so one rmtree covers them.
            shutil.rmtree(tmp_dir)

        return key, hash_hex

    def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
        """Collect ``(key, hash_hex)`` pairs of a batch into the result columns.

        Parameters
        ----------
        batch: list[Any]
            Items produced by ``preprocess_data``.

        Returns
        -------
        dict[str, list[Any]]
            The mapping obtained from ``self._get_dict_from_schema()`` with the
            key column and the ``video_hash`` column filled in.
        """
        df_batch_labels = self._get_dict_from_schema()
        for key, hash_hex in batch:
            df_batch_labels[self.key_column].append(key)
            df_batch_labels[self.result_columns[0]].append(hash_hex)
        return df_batch_labels

0 commit comments

Comments
 (0)