File size: 1,054 Bytes
"""On h800: download the GenSegDataset tars from HF (via proxy + token), extract them into Data/,
then remove the tars. Produces the processed_unified layout under Data/."""
import os, glob, tarfile
from huggingface_hub import snapshot_download
# Destination directory that receives the extracted dataset.
BASE = "/mnt/tidal-alsh-share2/dataset/qinshengqian/research/c3/NPJ-ACM/Data"
# Staging directory inside BASE where the downloaded archives are placed.
TARS = os.path.join(BASE, "_tars")


def extract_tars(tars_dir, dest):
    """Extract every ``*.tar`` found in *tars_dir* into *dest*.

    Archives are processed in sorted path order. When the running
    interpreter supports tarfile extraction filters, the ``"data"`` filter
    is applied so that members with absolute paths, ``..`` components or
    links pointing outside *dest* are rejected instead of being written.

    Args:
        tars_dir: Directory that is searched (non-recursively) for ``*.tar``.
        dest: Directory the archive members are extracted into.

    Returns:
        The sorted list of archive paths that were extracted (empty when
        no archive was found).

    Raises:
        tarfile.TarError: If an archive is unreadable or a member is
            rejected by the extraction filter.
    """
    archives = sorted(glob.glob(os.path.join(tars_dir, "*.tar")))
    # ``tarfile.data_filter`` only exists on interpreters whose extractall()
    # accepts ``filter=``; older ones fall back to the legacy behaviour.
    extra = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    for archive in archives:
        print(" extract", os.path.basename(archive), flush=True)
        with tarfile.open(archive) as tf:
            tf.extractall(dest, **extra)
    return archives


def move_readme(tars_dir, dest):
    """Move ``README.md`` from *tars_dir* into *dest*, if it is present.

    An existing ``README.md`` in *dest* is overwritten (``os.replace``).

    Returns:
        True when a README was moved, False when there was none.
    """
    readme = os.path.join(tars_dir, "README.md")
    if not os.path.isfile(readme):
        return False
    os.replace(readme, os.path.join(dest, "README.md"))
    return True


def cleanup_tars(tars_dir):
    """Delete the ``*.tar`` files in *tars_dir*, then the directory itself.

    Removing the directory is best-effort: ``os.rmdir`` only succeeds on an
    empty directory, so anything else left in it keeps it in place. The
    reason is printed rather than silently discarded.

    Returns:
        True when the directory was removed, False when it was kept.
    """
    for archive in glob.glob(os.path.join(tars_dir, "*.tar")):
        os.remove(archive)
    try:
        os.rmdir(tars_dir)
    except OSError as err:
        print(f" kept {tars_dir}: {err}", flush=True)
        return False
    return True


def main():
    """Download the dataset archives, unpack them into BASE and clean up.

    Prints ``DONE_DATA`` as the final line on success.

    Raises:
        SystemExit: If no ``*.tar`` archive is present after the download,
            so that the success marker is never printed for an empty run.
    """
    print("[1] downloading tars ...", flush=True)
    snapshot_download("MaybeRichard/GenSegDataset", repo_type="dataset",
                      allow_patterns=["*.tar", "README.md"], local_dir=TARS)
    print("[2] extracting ...", flush=True)
    if not extract_tars(TARS, BASE):
        raise SystemExit(f"no *.tar archives found in {TARS}; nothing was extracted")
    move_readme(TARS, BASE)
    print("[3] cleanup tars ...", flush=True)
    cleanup_tars(TARS)
    print("DONE_DATA", flush=True)


if __name__ == "__main__":
    main()
|