1
1
import argparse
2
2
import functools
3
3
import multiprocessing
4
+ import os
4
5
import textwrap
6
+ from hashlib import sha256
5
7
from multiprocessing import Manager , Pool
6
8
7
9
import pandas as pd
8
10
import plotly .express as px
9
11
import streamlit as st
10
12
from datasets import get_dataset_infos
13
+ from datasets .info import DatasetInfosDict
11
14
from pygments import highlight
12
15
from pygments .formatters import HtmlFormatter
13
16
from pygments .lexers import DjangoLexer
14
- from templates import INCLUDED_USERS
15
17
18
+ from promptsource import DEFAULT_PROMPTSOURCE_CACHE_HOME
16
19
from promptsource .session import _get_state
17
- from promptsource .templates import DatasetTemplates , Template , TemplateCollection
20
+ from promptsource .templates import INCLUDED_USERS , DatasetTemplates , Template , TemplateCollection
18
21
from promptsource .utils import (
19
22
get_dataset ,
20
23
get_dataset_confs ,
25
28
)
26
29
27
30
31
+ DATASET_INFOS_CACHE_DIR = os .path .join (DEFAULT_PROMPTSOURCE_CACHE_HOME , "DATASET_INFOS" )
32
+ os .makedirs (DATASET_INFOS_CACHE_DIR , exist_ok = True )
33
+
28
34
# Python 3.8 switched the default start method from fork to spawn. OS X also has
29
35
# some issues related to fork, see, e.g., https://github.com/bigscience-workshop/promptsource/issues/572
30
36
# so we make sure we always use spawn for consistency
@@ -38,7 +44,17 @@ def get_infos(all_infos, d_name):
38
44
:param all_infos: multiprocess-safe dictionary
39
45
:param d_name: dataset name
40
46
"""
41
- all_infos [d_name ] = get_dataset_infos (d_name )
47
+ d_name_bytes = d_name .encode ("utf-8" )
48
+ d_name_hash = sha256 (d_name_bytes )
49
+ foldername = os .path .join (DATASET_INFOS_CACHE_DIR , d_name_hash .hexdigest ())
50
+ if os .path .isdir (foldername ):
51
+ infos_dict = DatasetInfosDict .from_directory (foldername )
52
+ else :
53
+ infos = get_dataset_infos (d_name )
54
+ infos_dict = DatasetInfosDict (infos )
55
+ os .makedirs (foldername )
56
+ infos_dict .write_to_directory (foldername )
57
+ all_infos [d_name ] = infos_dict
42
58
43
59
44
60
# add an argument for read-only
@@ -181,11 +197,13 @@ def show_text(t, width=WIDTH, with_markdown=False):
181
197
else :
182
198
subset_infos = infos [subset_name ]
183
199
184
- split_sizes = {k : v .num_examples for k , v in subset_infos .splits .items ()}
200
+ try :
201
+ split_sizes = {k : v .num_examples for k , v in subset_infos .splits .items ()}
202
+ except Exception :
203
+ # Fixing bug in some community datasets.
204
+ # For simplicity, just filling `split_sizes` with nothing, so the displayed split sizes will be 0.
205
+ split_sizes = {}
185
206
else :
186
- # Zaid/coqa_expanded and Zaid/quac_expanded don't have dataset_infos.json
187
- # so infos is an empty dic, and `infos[list(infos.keys())[0]]` raises an error
188
- # For simplicity, just filling `split_sizes` with nothing, so the displayed split sizes will be 0.
189
207
split_sizes = {}
190
208
191
209
# Collect template counts, original task counts and names
0 commit comments