|
1 | 1 | # -*- coding: utf-8 -*-
|
2 | 2 |
|
| 3 | +import mmap |
3 | 4 | import os
|
| 5 | +import pickle |
4 | 6 | import sys
|
5 | 7 | import unicodedata
|
6 | 8 | import urllib
|
7 | 9 | import zipfile
|
8 |
| -from typing import Dict, List, Tuple, Union |
| 10 | +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union |
9 | 11 |
|
10 | 12 | import torch
|
11 | 13 | from omegaconf import DictConfig, OmegaConf
|
12 | 14 | from supar.utils.common import CACHE
|
| 15 | +from supar.utils.logging import progress_bar |
13 | 16 |
|
14 | 17 |
|
15 | 18 | def ispunct(token: str) -> bool:
|
@@ -265,6 +268,33 @@ def download(url: str, reload: bool = False) -> str:
|
265 | 268 | return path
|
266 | 269 |
|
267 | 270 |
|
def binarize(data: Iterable, fbin: Optional[str] = None) -> None:
    r"""
    Serializes every item of ``data`` into a single binary file.

    The file consists of three consecutive parts:

    1. the pickled items, written back to back in iteration order;
    2. a pickled tensor with one ``(offset, length)`` row per item;
    3. a pickled 2-element tensor ``(offset, length)`` locating part 2.

    The pairs are stored as ``(offset, length)`` so that they can be passed
    directly as the ``offset``/``length`` arguments of :func:`debinarize`.

    Args:
        data (Iterable):
            The objects to be serialized. Each one must be picklable.
        fbin (str):
            Path of the binary file to write. Although the default is ``None``,
            a real path is required, as the value is handed straight to ``open``.
    """

    offset, index = 0, []
    with open(fbin, 'wb') as f:
        for s in progress_bar(data):
            payload = pickle.dumps(s)
            f.write(payload)
            # record the byte count rather than the end position, so a reader
            # fetches exactly one record instead of over-reading past it
            index.append((offset, len(payload)))
            offset += len(payload)
        index_bytes = pickle.dumps(torch.tensor(index))
        # append the meta data to the end of the bin file
        f.write(index_bytes)
        # record the position and the size of the meta data
        f.write(pickle.dumps(torch.tensor((offset, len(index_bytes)))))
| 285 | + |
| 286 | + |
def debinarize(fbin: str, offset: Optional[int] = 0, length: Optional[int] = 0, meta: bool = False) -> Any:
    r"""
    Loads a single pickled object from a binary file via a read-only memory map.

    Args:
        fbin (str):
            Path of the binary file to read.
        offset (int):
            Byte position at which the pickled object starts. Ignored if ``meta`` is ``True``.
        length (int):
            Number of bytes to read from ``offset``. Ignored if ``meta`` is ``True``.
        meta (bool):
            If ``True``, the position of the object is not taken from the arguments.
            Instead, a pickled 2-element tensor is read from the very end of the file
            and its two values are used as ``offset`` and ``length``.

    Returns:
        The object unpickled from the selected byte range.
    """

    with open(fbin, 'rb') as f, mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        if meta:
            # The footer size is probed with constant zeros, so it never depends on
            # (or fails because of) whatever the caller passed as offset/length.
            # NOTE(review): this assumes the pickled size of a 2-element integer
            # tensor does not vary with the values it holds -- confirm.
            footer_size = len(pickle.dumps(torch.tensor((0, 0))))
            mm.seek(-footer_size, os.SEEK_END)
            offset, length = pickle.loads(mm.read(footer_size)).tolist()
        mm.seek(offset)
        # NOTE: unpickling can execute arbitrary code; only use on trusted files.
        return pickle.loads(mm.read(length))
| 296 | + |
| 297 | + |
268 | 298 | def get_rng_state():
|
269 | 299 | state = {'rng_state': torch.get_rng_state()}
|
270 | 300 | if torch.cuda.is_available():
|
|
0 commit comments