update data code #11

Merged
merged 1 commit on Feb 21, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
**/__pycache__
.DS_Store
data/
!src/data/
.pyc
__pycache__
start_job.py
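The new !src/data/ negation re-includes the Python package under src/ while top-level data/ directories stay ignored. As a rough illustration (not part of this PR), the third-party pathspec package can emulate gitignore matching; the sample paths below are invented:

import pathspec

# Only the two rules relevant to this change; gitignore semantics emulated via pathspec
spec = pathspec.PathSpec.from_lines('gitwildmatch', ['data/', '!src/data/'])

print(spec.match_file('data/sample.csv'))        # expected True: still ignored
print(spec.match_file('src/data/data_load.py'))  # expected False: re-included by !src/data/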
51 changes: 51 additions & 0 deletions src/data/data_load.py
@@ -0,0 +1,51 @@
import json
import os
import pandas as pd

from loguru import logger

from src.hparams.evaluate_args import EvaluateArguments
from src.data.data_preprocess import preprocess


def load_all_dataset(eval_args: EvaluateArguments):
    '''
    Load and preprocess every requested eval dataset; returns a dict keyed by dataset name.
    '''
    # Resolve the file path for each eval dataset
dataset_name_list = eval_args.eval_dataset_list
eval_dataset_fp_conf_path = eval_args.eval_dataset_fp_conf_path

with open(eval_dataset_fp_conf_path, 'r') as f:
dataset_fn_dict = json.load(f)

data_dir = eval_args.data_path

logger.info(dataset_name_list)
    # 'all' expands to every dataset declared in the config file
    if len(dataset_name_list) == 1 and dataset_name_list[0] == 'all':
        dataset_name_list = list(dataset_fn_dict.keys())

    logger.info('Start loading and preprocessing datasets')
all_dataset = {}
for dataset_name in dataset_name_list:
        dataset_fp = os.path.join(data_dir, eval_args.eval_language,
                                  eval_args.eval_dataset_type, dataset_fn_dict[dataset_name])
df = pd.read_csv(dataset_fp)

        # Read dev data when running a few-shot test
        df_dev = None
        if eval_args.k_shot > 0:
            dev_dataset_fp = os.path.join(data_dir, eval_args.eval_language,
                                          'dev', dataset_fn_dict[dataset_name])
            df_dev = pd.read_csv(dev_dataset_fp)

all_dataset[dataset_name] = preprocess(df, eval_args, df_dev=df_dev)
        logger.info('Load success, dataset_name={}, dataset_file_path={}, dataset question count={}'.format(
            dataset_name, dataset_fp, len(all_dataset[dataset_name])))
return all_dataset

if __name__ == '__main__':
    # Quick manual check: resolve the repository-level data directory relative to this file
    this_dir = os.path.split(os.path.realpath(__file__))[0]
    data_dir = os.path.abspath(os.path.dirname(this_dir) + os.path.sep + '../data')
    logger.debug(data_dir)
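For orientation, a minimal sketch of how load_all_dataset might be driven. The attribute names on eval_args (eval_dataset_list, eval_dataset_fp_conf_path, data_path, eval_language, eval_dataset_type, k_shot) come from the code above; the SimpleNamespace stand-in, the import path src.data.data_load, and all concrete values are illustrative assumptions, not part of this PR.

from types import SimpleNamespace

from src.data.data_load import load_all_dataset

# Stand-in for EvaluateArguments (defined in src/hparams/evaluate_args.py); values are made up.
eval_args = SimpleNamespace(
    eval_dataset_list=['all'],                       # 'all' expands to every dataset in the config
    eval_dataset_fp_conf_path='conf/datasets.json',  # hypothetical name -> csv file mapping
    data_path='data',                                # root data directory
    eval_language='zh',                              # hypothetical language sub-directory
    eval_dataset_type='test',                        # hypothetical split sub-directory
    k_shot=0,                                        # >0 also loads the matching dev csv
)

all_dataset = load_all_dataset(eval_args)
for name, questions in all_dataset.items():
    print(name, len(questions))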
68 changes: 68 additions & 0 deletions src/data/data_preprocess.py
@@ -0,0 +1,68 @@
import pandas as pd
from loguru import logger


def preprocess(df: pd.DataFrame, eval_args, df_dev: pd.DataFrame = None):
    '''
    Preprocess df (optionally with a dev-set df_dev for few-shot examples)
    and generate the final list of question dicts.
    '''
    # Instruction prompt (Chinese): "The following are single-choice questions in the
    # DevOps domain. Please choose the correct answer and output only the option."
    question_prompt = '''以下是关于开发运维领域的单项选择题,请选出其中的正确答案。请直接输出选项。\n'''

if eval_args.k_shot > 0 and df_dev is not None:
        # Normalize capitalized column names ('Question', 'Answer') to lowercase
df_dev.rename(columns={
'Question': 'question',
'Answer': 'answer'
}, inplace=True)

prefix = ''

        # Build k_shot in-context examples from the dev set.
        # '问题:' means 'Question:' and '答案:' means 'Answer:'.
        for idx in range(eval_args.k_shot):
            question = df_dev['question'].iloc[idx]
            prefix = prefix + question_prompt + '问题:' + question + '\n'

            for option in ['A', 'B', 'C', 'D']:
                # pd.notna guards against empty option cells (NaN would pass a bare truthiness check)
                if pd.notna(df_dev[option].iloc[idx]):
                    prefix += '{}. {}\n'.format(option, df_dev[option].iloc[idx])
            prefix += '答案:{}\n'.format(df_dev['answer'].iloc[idx].strip().upper())

        # Append the instruction once more so the actual question follows it
        prefix = prefix + question_prompt
res = preprocess_question(df, prefix)
else:
res = preprocess_question(df, question_prompt)

return res

def preprocess_question(df: pd.DataFrame, prefix: str = ''):
    '''
    Turn each row of df into a question dict containing the prompt query,
    the available options, and the gold answer.
    '''
res = []

    # Normalize capitalized column names ('Question', 'Answer') to lowercase
df.rename(columns={
'Question': 'question',
'Answer': 'answer'
}, inplace=True)

for idx in range(df.shape[0]):
to_append = {
'question': df['question'].iloc[idx],
'options': [],
'answer': df['answer'].iloc[idx].strip().upper()
}
        question = df['question'].iloc[idx]

        # '问题:' means 'Question:'
        query = prefix + '''问题:{question}\n'''.format(question=question)

        for option in ['A', 'B', 'C', 'D']:
            # pd.notna guards against empty option cells (NaN would pass a bare truthiness check)
            if pd.notna(df[option].iloc[idx]):
                to_append['options'].append(option)
                to_append[option] = df[option].iloc[idx]
                query += '{}. {}\n'.format(option, df[option].iloc[idx])

        # '答案:' means 'Answer:'; the model is expected to complete it with an option letter
        query += '答案:'
to_append['query'] = query
res.append(to_append)
return res
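To make the per-row output concrete, here is a small sketch of preprocess_question applied to an invented two-option row; only the column names (Question, A-D, Answer) and the import path src.data.data_preprocess are taken from the code above.

import pandas as pd

from src.data.data_preprocess import preprocess_question

# One invented row; unused option columns are left empty.
df = pd.DataFrame([{
    'Question': 'Which command lists running processes on Linux?',
    'A': 'ps aux',
    'B': 'ls -l',
    'C': None,
    'D': None,
    'Answer': 'a',
}])

res = preprocess_question(df, prefix='')
print(res[0]['options'])  # ['A', 'B']
print(res[0]['answer'])   # 'A' (stripped and upper-cased)
print(res[0]['query'])    # '问题:...\nA. ps aux\nB. ls -l\n答案:'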