File size: 1,054 Bytes
1a18f22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
"""On h800: download GenSegDataset tars from HF (via proxy+token), extract into Data/,
then remove the tars. Produces the processed_unified layout under Data/."""
import glob
import os
import shutil
import tarfile

from huggingface_hub import snapshot_download

# Destination root for the extracted dataset, and a scratch folder beneath it
# that temporarily holds the downloaded archives.
BASE = "/mnt/tidal-alsh-share2/dataset/qinshengqian/research/c3/NPJ-ACM/Data"
TARS = os.path.join(BASE, "_tars")

print("[1] downloading tars ...", flush=True)
# Only the tar archives and the README are requested from the dataset repo.
snapshot_download(
    repo_id="MaybeRichard/GenSegDataset",
    repo_type="dataset",
    local_dir=TARS,
    allow_patterns=["*.tar", "README.md"],
)

print("[2] extracting ...", flush=True)
# Extraction filters ship with Python 3.12+ and with patched older releases;
# probing for ``tarfile.data_filter`` is the documented feature check. The
# "data" filter refuses members that would land outside BASE (absolute paths,
# ``..`` components, links pointing outside the destination) and device files.
# On interpreters without filter support we fall back to the plain call.
_extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
# Sorted so archives are unpacked in a deterministic order.
for t in sorted(glob.glob(os.path.join(TARS, "*.tar"))):
    print("  extract", os.path.basename(t), flush=True)
    with tarfile.open(t) as tf:
        tf.extractall(BASE, **_extract_kwargs)

# Move the README next to the extracted data, if it was downloaded.
rd = os.path.join(TARS, "README.md")
if os.path.isfile(rd):
    os.replace(rd, os.path.join(BASE, "README.md"))

print("[3] cleanup tars ...", flush=True)
for t in glob.glob(os.path.join(TARS, "*.tar")):
    os.remove(t)
# NOTE(review): per the huggingface_hub download guide, snapshot_download with
# local_dir keeps its bookkeeping in a ".cache" folder inside that directory,
# which would make the rmdir below fail on a non-empty directory. Drop that
# folder first; ignore_errors covers versions/runs where it does not exist.
shutil.rmtree(os.path.join(TARS, ".cache"), ignore_errors=True)
try:
    # Still best-effort: only an empty scratch folder is removed.
    os.rmdir(TARS)
except OSError as exc:
    # Report instead of silently swallowing, but do not abort the run.
    print(f"  note: kept {TARS}: {exc}", flush=True)

print("DONE_DATA", flush=True)