From dfe2356c577921b0a1185c9166a898a66b72e51f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A9=E8=BF=9B?=
Date: Wed, 21 Feb 2024 11:03:38 +0800
Subject: [PATCH] update data code

---
 .gitignore                  |  1 +
 src/data/ data_load.py      | 68 ++++++++++++++++++++++++++
 src/data/data_preprocess.py | 95 +++++++++++++++++++++++++++++++++++++
 3 files changed, 164 insertions(+)
 create mode 100755 src/data/ data_load.py
 create mode 100755 src/data/data_preprocess.py

diff --git a/.gitignore b/.gitignore
index 68b28a8..6f859b3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 **/__pycache__
 .DS_Store
 data/
+!src/data/
 .pyc
 __pycache__
 start_job.py
diff --git a/src/data/ data_load.py b/src/data/ data_load.py
new file mode 100755
index 0000000..80ac57b
--- /dev/null
+++ b/src/data/ data_load.py
@@ -0,0 +1,68 @@
+import json
+import jsonlines
+import os
+import pandas as pd
+
+from loguru import logger
+
+from src.hparams.evaluate_args import EvaluateArguments
+from src.data.data_preprocess import preprocess
+
+
+def load_all_dataset(eval_args: EvaluateArguments):
+    '''
+    Load and preprocess every requested eval dataset.
+
+    Dataset files are read from data_path/eval_language/eval_dataset_type/.
+    When k_shot is positive, the few-shot examples of a dataset are read from
+    the file with the same name under data_path/eval_language/dev/.
+
+    Args:
+        eval_args: evaluation arguments; this function reads eval_dataset_list,
+            eval_dataset_fp_conf_path, data_path, eval_language,
+            eval_dataset_type and k_shot.
+
+    Returns:
+        dict mapping each dataset name to the list built by preprocess.
+
+    Raises:
+        KeyError: a requested dataset name is missing from the config file.
+    '''
+    dataset_name_list = eval_args.eval_dataset_list
+
+    # The config file is a json object mapping dataset name to csv file name
+    with open(eval_args.eval_dataset_fp_conf_path, 'r', encoding='utf-8') as f:
+        dataset_fn_dict = json.load(f)
+
+    logger.info(dataset_name_list)
+    # A single 'all' entry selects every dataset listed in the config file
+    if len(dataset_name_list) == 1 and dataset_name_list[0] == 'all':
+        dataset_name_list = list(dataset_fn_dict)
+
+    language_dir = os.path.join(eval_args.data_path, eval_args.eval_language)
+    eval_dir = os.path.join(language_dir, eval_args.eval_dataset_type)
+    dev_dir = os.path.join(language_dir, 'dev')
+
+    logger.info('Start load and preprocess dataset')
+    all_dataset = {}
+    for dataset_name in dataset_name_list:
+        dataset_fn = dataset_fn_dict[dataset_name]
+        dataset_fp = os.path.join(eval_dir, dataset_fn)
+        df = pd.read_csv(dataset_fp)
+
+        # Read dev data if doing few-shot test
+        df_dev = None
+        if eval_args.k_shot > 0:
+            df_dev = pd.read_csv(os.path.join(dev_dir, dataset_fn))
+
+        all_dataset[dataset_name] = preprocess(df, eval_args, df_dev=df_dev)
+        logger.info('Load success, dataset_name={}, dataset_file_path={}, dataset question count={}'.format(
+            dataset_name, dataset_fp, len(all_dataset[dataset_name])))
+    return all_dataset
+
+
+if __name__ == '__main__':
+    # Debug helper: log an absolute path derived from this file's location
+    a = os.path.split(os.path.realpath(__file__))[0]
+    b = os.path.abspath(os.path.dirname(a)+os.path.sep+"../data")
+    logger.debug(b)
\ No newline at end of file
diff --git a/src/data/data_preprocess.py b/src/data/data_preprocess.py
new file mode 100755
index 0000000..39a8082
--- /dev/null
+++ b/src/data/data_preprocess.py
@@ -0,0 +1,95 @@
+import pandas as pd
+from loguru import logger
+
+
+# Option columns, in the order they are written into the prompt
+OPTION_LIST = ['A', 'B', 'C', 'D']
+# Accept the capitalized spellings of these two column names
+COLUMN_RENAME = {'Question': 'question', 'Answer': 'answer'}
+
+
+def _has_value(value):
+    '''
+    Tell whether an option cell holds a value that should be shown
+    '''
+    # pandas reads an empty csv cell as NaN, and NaN is truthy, so a plain
+    # truth test would print the missing option as "nan" (and drop a real 0)
+    if isinstance(value, str):
+        return value != ''
+    return not pd.isna(value)
+
+
+def preprocess(df: pd.DataFrame, eval_args, df_dev: pd.DataFrame = None):
+    '''
+    Build the query dicts for df, prepending few-shot examples when asked.
+
+    Args:
+        df: questions to evaluate, see preprocess_question for the columns.
+        eval_args: evaluation arguments; only k_shot is read here.
+        df_dev: rows used as few-shot examples, same columns as df.
+            Ignored when k_shot is not positive.
+
+    Returns:
+        list of dicts as returned by preprocess_question.
+    '''
+    question_prompt = '''以下是关于开发运维领域的单项选择题,请选出其中的正确答案。请直接输出选项。\n'''
+
+    if eval_args.k_shot <= 0 or df_dev is None:
+        return preprocess_question(df, question_prompt)
+
+    # Work on a renamed copy so the caller's frame is left untouched
+    df_dev = df_dev.rename(columns=COLUMN_RENAME)
+
+    # Never index past the end of the dev set when it has fewer rows than k_shot
+    shot_count = min(eval_args.k_shot, len(df_dev))
+    prefix = ''
+    for idx in range(shot_count):
+        prefix += question_prompt + '问题:' + df_dev['question'].iloc[idx] + '\n'
+        for option in OPTION_LIST:
+            value = df_dev[option].iloc[idx]
+            if _has_value(value):
+                prefix += '{}. {}\n'.format(option, value)
+        prefix += '答案:{}\n'.format(df_dev['answer'].iloc[idx].strip().upper())
+
+    # The instruction is repeated once more in front of the real question
+    prefix += question_prompt
+    return preprocess_question(df, prefix)
+
+
+def preprocess_question(df: pd.DataFrame, prefix: str = ''):
+    '''
+    Turn every row of df into a dict holding the question and its query.
+
+    Args:
+        df: one question per row, with columns question (or Question),
+            answer (or Answer) and A, B, C, D; empty option cells are skipped.
+        prefix: text placed in front of every query.
+
+    Returns:
+        list of dicts with keys question, options (labels of the non-empty
+        options), answer (stripped and upper-cased), query, plus one key per
+        non-empty option holding its value.
+    '''
+    # Work on a renamed copy so the caller's frame is left untouched
+    df = df.rename(columns=COLUMN_RENAME)
+
+    res = []
+    for idx, (question, answer) in enumerate(zip(df['question'], df['answer'])):
+        to_append = {
+            'question': question,
+            'options': [],
+            'answer': answer.strip().upper()
+        }
+        query = prefix + '''问题:{question}\n'''.format(question=question)
+
+        for option in OPTION_LIST:
+            value = df[option].iloc[idx]
+            if _has_value(value):
+                to_append['options'].append(option)
+                to_append[option] = value
+                query += '{}. {}\n'.format(option, value)
+
+        query += '答案:'
+        to_append['query'] = query
+        res.append(to_append)
+    return res