-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_data_loader.py
More file actions
145 lines (126 loc) · 6.25 KB
/
Copy pathrun_data_loader.py
File metadata and controls
145 lines (126 loc) · 6.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import time
import fire
import json
from typing import Optional, Union
import pandas as pd
def load_json(
    filepath: str,
    mode: str = "r",
    encoding: str = "utf-8",
    verbose: bool = False,
    errors: Optional[str] = None,
) -> Union[list, dict]:
    """Read one JSON document from *filepath*.

    Args:
        filepath: Path of the JSON file.
        mode: Mode string handed to ``open``.
        encoding: Text encoding handed to ``open``.
        verbose: When true, print which path is read (or that it is missing).
        errors: Decoding error policy handed to ``open``.

    Returns:
        The value produced by ``json.load``; an empty list when *filepath*
        is not an existing regular file.
    """
    # Guard clause: a missing path is reported (optionally) and yields [].
    if not os.path.isfile(filepath):
        if verbose:
            print(f">>> [load_json] filepath does not exist: {filepath}")
        return []

    if verbose:
        print(f">>> [load_json] {filepath}")
    with open(filepath, mode, encoding=encoding, errors=errors) as handle:
        return json.load(handle)
def load_jsonl(
    filepath: str,
    mode: str = "r",
    encoding: str = "utf-8",
    verbose: bool = False,
    errors: Optional[str] = None,
) -> list:
    """Read a JSON Lines file into a list, one parsed value per line.

    Args:
        filepath: Path of the ``.jsonl`` file.
        mode: Mode string handed to ``open``.
        encoding: Text encoding handed to ``open``.
        verbose: When true, print which path is read (or that it is missing).
        errors: Decoding error policy handed to ``open``.

    Returns:
        The parsed records in file order; an empty list when *filepath* is
        not an existing regular file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.isfile(filepath):
        if verbose:
            print(f">>> [load_jsonl] filepath does not exist: {filepath}")
        return []

    if verbose:
        print(f">>> [load_jsonl] {filepath}")
    with open(filepath, mode, encoding=encoding, errors=errors) as fp_in:
        # Blank or whitespace-only lines (e.g. an extra trailing newline) hold
        # no record; passing them to json.loads would raise JSONDecodeError.
        return [json.loads(line) for line in fp_in if line.strip()]
def load_parquet(
    filepath: str,
    engine: str = "auto",  # "auto", "pyarrow", "fastparquet"
    verbose: bool = False,
) -> Union[pd.DataFrame, list]:
    """Read a parquet file into a pandas DataFrame.

    Args:
        filepath: Path of the parquet file.
        engine: Parquet engine name forwarded to ``pd.read_parquet``.
        verbose: When true, print which path is read (or that it is missing).

    Returns:
        The DataFrame produced by ``pd.read_parquet``. When *filepath* is not
        an existing regular file an empty ``list`` is returned instead (not an
        empty DataFrame), which is why the return annotation is a union.
    """
    if not os.path.isfile(filepath):
        if verbose:
            print(f">>> [load_parquet] filepath does not exist: {filepath}")
        return []

    if verbose:
        print(f">>> [load_parquet] {filepath}")
    return pd.read_parquet(filepath, engine=engine)
def _require(condition: bool, where: str, what: str) -> None:
    """Raise ``AssertionError`` tagged with *where* unless *condition* holds.

    An explicit raise is used instead of an ``assert`` statement so the check
    still runs when Python is started with ``-O``.
    """
    if not condition:
        raise AssertionError(f"{where}: {what}")


def _is_nonempty_str(mapping: dict, key: str) -> bool:
    """Return True when *key* maps to a string of length >= 1 in *mapping*."""
    value = mapping.get(key)
    return isinstance(value, str) and len(value) > 0


def _check_item_metadata(cur_metadata: object, item_id: str, where: str) -> None:
    """Validate the per-item ``metadata`` dict (exactly 9 typed fields)."""
    _require(isinstance(cur_metadata, dict) and len(cur_metadata) == 9, where,
             "'metadata' must be a dict with exactly 9 keys")
    _require(isinstance(cur_metadata.get("id"), str) and cur_metadata["id"] == item_id, where,
             "metadata 'id' must be a string equal to the item 'id'")

    paper_year = cur_metadata.get("paper_year")
    # bool is a subclass of int, so exclude it explicitly.
    _require(isinstance(paper_year, int) and not isinstance(paper_year, bool) and paper_year > 0, where,
             "metadata 'paper_year' must be a positive int")

    for key in ("original_task", "original_split", "text_form", "intent_type"):
        _require(_is_nonempty_str(cur_metadata, key), where,
                 f"metadata '{key}' must be a non-empty string")
    for key in ("is_synthetic", "is_sensitive"):
        _require(isinstance(cur_metadata.get(key), bool), where,
                 f"metadata '{key}' must be a bool")

    domain_topic = cur_metadata.get("domain_topic")
    _require(isinstance(domain_topic, list) and len(domain_topic) > 0, where,
             "metadata 'domain_topic' must be a non-empty list")
    for d_t in domain_topic:
        _require(isinstance(d_t, list) and len(d_t) == 2, where,
                 "each 'domain_topic' entry must be a 2-element list")
        _require(isinstance(d_t[0], str) and len(d_t[0]) > 0, where,
                 "the domain in a 'domain_topic' entry must be a non-empty string")
        # The topic can be empty, but it still has to be a string.
        _require(isinstance(d_t[1], str), where,
                 "the topic in a 'domain_topic' entry must be a string")


def _check_data_item(data_item: object, where: str) -> None:
    """Validate one loaded record (exactly 8 typed fields, including metadata)."""
    _require(isinstance(data_item, dict) and len(data_item) == 8, where,
             "item must be a dict with exactly 8 keys")
    for key in ("id", "speaker", "context", "question"):
        _require(_is_nonempty_str(data_item, key), where,
                 f"'{key}' must be a non-empty string")
    for key in ("options", "answer_intent", "answer_index"):
        _require(isinstance(data_item.get(key), list), where, f"'{key}' must be a list")

    cur_options = data_item["options"]
    cur_answer_intent = data_item["answer_intent"]
    cur_answer_index = data_item["answer_index"]
    _require(1 <= len(cur_answer_intent) == len(cur_answer_index) <= len(cur_options) <= 10, where,
             "need 1 <= len(answer_intent) == len(answer_index) <= len(options) <= 10")

    # The correct intent answers must all appear in the options list.
    # NOTE(review): answer_index is only length-checked here; its values are
    # not cross-checked against the options.
    cur_options_set = set(cur_options)
    for _intent in cur_answer_intent:
        _require(isinstance(_intent, str) and _intent in cur_options_set, where,
                 "every 'answer_intent' entry must be a string present in 'options'")

    _require("metadata" in data_item, where, "'metadata' is missing")
    _check_item_metadata(data_item["metadata"], data_item["id"], where)


def main(data_root: str = "data/intent_grasp") -> None:
    """Check the format of the IntentGrasp evaluation data files.

    For each of the (set, split) pairs ``all/train``, ``all/test`` and
    ``gem/test`` this loads ``<data_root>/<set>/metadata.json`` and
    ``<data_root>/<set>/<split>.jsonl`` and verifies the type and shape of
    every record. One confirmation line is printed per pair, followed by the
    total running time.

    Args:
        data_root: Directory that contains one sub-directory per evaluation
            set. Defaults to the previously hard-coded ``data/intent_grasp``.

    Raises:
        AssertionError: If a directory or file is missing, or a record does
            not match the expected format. The message names the file and
            the 1-based record number where applicable.
    """
    timer_start = time.perf_counter()

    for eval_set, data_split in [("all", "train"), ("all", "test"), ("gem", "test")]:
        bench_data_dir = os.path.join(data_root, eval_set)
        if not os.path.isdir(bench_data_dir):
            raise AssertionError(f"IntentGrasp evaluation data dir not found: {bench_data_dir}")

        # Load the metadata
        meta_filepath = os.path.join(bench_data_dir, "metadata.json")
        _require(os.path.isfile(meta_filepath), meta_filepath, "metadata file not found")
        metadata = load_json(meta_filepath, verbose=False)
        _require(isinstance(metadata, dict), meta_filepath, "metadata must be a JSON object")

        # Load the data list. Only the JSONL form of the split is checked here.
        eval_data_fp = os.path.join(bench_data_dir, f"{data_split}.jsonl")
        _require(os.path.isfile(eval_data_fp), eval_data_fp, "data file not found")
        eval_data = load_jsonl(eval_data_fp, verbose=False)
        _require(isinstance(eval_data, list), eval_data_fp, "loaded data must be a list")

        # Check data types & format for each data item
        for record_no, data_item in enumerate(eval_data, start=1):
            _check_data_item(data_item, f"{eval_data_fp} record #{record_no}")

        print(f">>> Done Check: {bench_data_dir}")

    timer_end = time.perf_counter()
    total_sec = timer_end - timer_start
    print(f"Total Running Time: {total_sec:.1f} sec ({total_sec / 60:.1f} min; {total_sec / 3600:.2f} h)")
if __name__ == "__main__":
    # Run only when executed as a script: hand main() to python-fire, which
    # exposes it as the command-line entry point.
    fire.Fire(main)