Commit 300203a

Merge pull request gangiswag#1 from rryisthebest/main

Add Initial Code (Training + Inference + Instructions)

2 parents efdd17f + 699c4a1 commit 300203a

67 files changed (+5888, -1 lines)

.gitignore

Lines changed: 13 additions & 0 deletions
datasets/*
data/*
scripts/__pycache__/*
scripts/utils/__pycache__/*
models/*
scripts/*.ipynb
wandb/*
logs/*
qrels/*
outputs/*
scripts/latency_test.py
scripts/logits_reranking_test.py
temp/*

README.md

Lines changed: 84 additions & 1 deletion

# FIRST: Faster Improved Listwise Reranking with Single Token Decoding

Relevance Feedback code will be released shortly after!

## Installation

You need to install the tevatron library (original source [here](https://github.com/texttron/tevatron)), which provides the framework for retrieval.

```
conda create --name {your env name} python=3.9.18
conda activate {your env name}
cd tevatron
pip install --editable .
pip install beir
```

You also need to install the vLLM library (instructions [here](https://docs.vllm.ai/en/latest/getting_started/installation.html)), which provides optimized LLM generation.

Before running, do:
```
export REPO_DIR=<path to this directory e.g. /shared/nas/data/m1/revanth3/exp/prf/ai2_data/workspace/repo/llm-reranker>
```
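
As a quick, optional sanity check (not part of the repo), you can confirm that the key packages import and that `REPO_DIR` is set before running the bash scripts:

```
import importlib.util
import os

# Check that the libraries the pipeline depends on are importable
for pkg in ("tevatron", "beir", "vllm"):
    assert importlib.util.find_spec(pkg), f"{pkg} is not installed"

# The bash scripts resolve all paths relative to REPO_DIR
assert os.environ.get("REPO_DIR"), "export REPO_DIR before running the scripts"
print("environment OK")
```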

## 1. Retrieval

Please download the precomputed BEIR encodings stored at (Link will be added shortly).

Run the baseline Contriever retrieval using the precomputed encodings:

```
bash bash/beir/run_1st_retrieval.sh <Path of precomputed BEIR encodings>
```

To get the baseline Contriever scores and preprocess the datasets, run:

```
bash bash/beir/run_eval.sh rank
```
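
The retrieval step writes one `rank.tsv` per dataset under `outputs/beir/`. A minimal sketch for loading it downstream, assuming the tab-separated `qid, docid, score` layout that tevatron's `--save_text` flag commonly produces:

```
from collections import defaultdict

def load_ranking(path, top_k=100):
    # Group rows by query and keep the top_k highest-scoring passages
    runs = defaultdict(list)
    with open(path) as f:
        for line in f:
            qid, docid, score = line.strip().split("\t")
            runs[qid].append((docid, float(score)))
    return {q: sorted(hits, key=lambda x: -x[1])[:top_k] for q, hits in runs.items()}

runs = load_ranking("outputs/beir/trec-covid/rank.tsv")
```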

## 2. Reranking

### 2a. Baseline Cross-encoder reranking

The cross-encoder reranking config is at `{REPO_DIR}/bash/beir/run_rerank_CE.sh`.

To run the baseline cross-encoder reranking:

```
bash bash/beir/run_rerank_CE.sh
```

### 2b. LLM Reranking

The LLM results preparation config is at `{REPO_DIR}/bash/beir/run_convert_results.sh`.

To prepare the retrieval results for LLM reranking, run:

```
bash bash/beir/run_convert_results.sh
```

The LLM reranking config is at `{REPO_DIR}/bash/beir/run_rerank_llm.sh`.

To run the LLM reranking:

```
bash bash/beir/run_rerank_llm.sh
```
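
FIRST's core idea is to read the ranking off the logits of the first decoded identifier token instead of generating the full permutation. A minimal sketch of that scoring step, assuming alphabetic identifiers that map to single tokens; the repo's `scripts/rerank_llm.py` (and its batched vLLM path) will differ in detail:

```
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL = "rryisthebest/First_Model"
tok = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16)
model.eval()

def rank_window(prompt, num_passages):
    # Logits for the very first output token, given the listwise prompt
    inputs = tok(prompt, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1]
    # Score each passage by the logit of its identifier token (A, B, C, ...)
    ids = [tok.convert_tokens_to_ids(chr(ord("A") + i)) for i in range(num_passages)]
    scores = logits[ids]
    return sorted(range(num_passages), key=lambda i: -scores[i].item())
```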

The evaluation config is at `{REPO_DIR}/bash/beir/run_eval.sh`.

To verify that ranking performance has improved after reranking, run:

```
bash bash/beir/run_eval.sh rerank
```

Set the `--suffix` flag to `llm_FIRST_alpha` for FIRST LLM evaluation or `ce` for the cross-encoder reranker.
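
Under the hood, evaluation on BEIR boils down to scoring the run against the qrels. A minimal sketch with BEIR's evaluator (the dict layouts follow BEIR's conventions; `scripts/eval.py` itself may organize this differently):

```
from beir.retrieval.evaluation import EvaluateRetrieval

qrels = {"q1": {"d1": 2, "d2": 0}}          # {qid: {docid: graded relevance}}
results = {"q1": {"d1": 13.2, "d2": 11.7}}  # {qid: {docid: reranker score}}

ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, results, [1, 10])
print(ndcg)  # e.g. {'NDCG@1': 1.0, 'NDCG@10': 1.0}
```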

## 3. Model Training

### 3a. Training Dataset

The converted training dataset (alphabetic IDs) is on [HF](https://huggingface.co/datasets/rryisthebest/rank_zephyr_training_data_alpha). The standard numeric training dataset can be found [here](https://huggingface.co/datasets/castorini/rank_zephyr_training_data).

### 3b. Training

We support three training objectives (a sketch of the combined loss follows this list):

- **Ranking**: uses a learning-to-rank algorithm over the logits of the highest-ranked passage ID.
- **Generation**: follows the principles of causal language modeling, focusing on permutation generation.
- **Combined**: a novel weighted objective, introduced in our paper, that integrates both the ranking and generation principles; this is the setting applied to the FIRST model.
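
A minimal sketch of what a combined objective can look like, assuming a simple weighted sum; `alpha` and the cross-entropy stand-ins are illustrative, not the exact formulation from the paper:

```
import torch
import torch.nn.functional as F

def combined_loss(rank_logits, rank_target, lm_logits, lm_targets, alpha=0.5):
    # Learning-to-rank term over the first-step passage-ID logits
    ranking = F.cross_entropy(rank_logits, rank_target)
    # Causal-LM term over the tokens of the generated permutation
    generation = F.cross_entropy(lm_logits.view(-1, lm_logits.size(-1)), lm_targets.view(-1))
    return alpha * ranking + (1 - alpha) * generation

# Toy shapes: batch of 4, 20 candidate IDs, 60 output tokens, 32k vocab
loss = combined_loss(torch.randn(4, 20), torch.randint(20, (4,)),
                     torch.randn(4, 60, 32000), torch.randint(32000, (4, 60)))
```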

Training and accelerate configs are at `{REPO_DIR}/bash/run_train.sh` and `{REPO_DIR}/train_configs/accel_config.yaml`, respectively.

To train the model, run:

```
bash bash/run_train.sh
```

To train a gated model, log in to Hugging Face with an access token from huggingface.co/settings/tokens:

```
huggingface-cli login
```

bash/beir/run_1st_retrieval.sh

Lines changed: 34 additions & 0 deletions

#!/bin/bash

# Ensure the input directory is provided
if [ -z "$1" ]; then
    echo "Usage: $0 <input_directory>"
    exit 1
fi

input_dir="$1"

# Create necessary directories
output_dir="${REPO_DIR}/outputs/beir"
data_dir="${REPO_DIR}/datasets/beir"

mkdir -p "$output_dir" "$data_dir"

# Datasets to process
datasets=('trec-covid') # 'climate-fever' 'dbpedia-entity' 'fever' 'fiqa' 'hotpotqa' 'msmarco' 'nfcorpus' 'nq' 'scidocs' 'scifact' 'trec-covid'

# Iterate over datasets
for dataset in "${datasets[@]}"; do
    echo "Processing dataset: ${dataset}"

    dataset_output_dir="${output_dir}/${dataset}"
    mkdir -p "$dataset_output_dir"

    python -m tevatron.faiss_retriever \
        --query_reps "${input_dir}/${dataset}/original_query/qry.pt" \
        --passage_reps "${input_dir}/${dataset}/original_corpus/*.pt" \
        --depth 1000 \
        --batch_size -1 \
        --save_text \
        --save_ranking_to "${dataset_output_dir}/rank.tsv"
done
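
For context, the `tevatron.faiss_retriever` call above performs an exact inner-product search of the query encodings against the precomputed passage encodings. A minimal sketch of that operation with FAISS (shapes and names are illustrative, not tevatron's internals):

```
import faiss
import numpy as np

passage_reps = np.random.rand(10000, 768).astype("float32")  # stand-in corpus encodings
query_reps = np.random.rand(5, 768).astype("float32")        # stand-in query encodings

index = faiss.IndexFlatIP(passage_reps.shape[1])  # exact inner-product index
index.add(passage_reps)
scores, indices = index.search(query_reps, 1000)  # depth 1000, as in the script
```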

bash/beir/run_convert_results.sh

Lines changed: 23 additions & 0 deletions

#!/bin/bash

data_dir=${REPO_DIR}/datasets/beir/
output_dir=${REPO_DIR}/outputs/beir/

# List of datasets to process
datasets=('trec-covid') # 'climate-fever' 'fever' 'hotpotqa' 'msmarco' 'nfcorpus' 'nq' 'fiqa' 'scidocs' 'scifact' 'dbpedia-entity' 'trec-covid'

# Iterate over datasets and process each one
# (use a loop variable distinct from the array name to avoid shadowing it)
for dataset in "${datasets[@]}"; do
    echo "Processing dataset: ${dataset}"

    # Execute the conversion script with error handling
    if python "${REPO_DIR}/scripts/convert_results.py" \
        --dataset "${dataset}" \
        --output_dir "${output_dir}" \
        --data_type "beir" \
        --data_dir "${data_dir}" \
        --top_k 100; then
        echo "Successfully processed ${dataset}"
    else
        echo "Failed to process ${dataset}" >&2
        exit 1
    fi
done
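
`scripts/convert_results.py` is not shown in this commit view; conceptually, this step joins the `(qid, docid, score)` run with the BEIR queries and corpus so the reranker sees text rather than IDs. A hedged sketch with an assumed JSON layout:

```
import json

queries = {"q1": "what is listwise reranking?"}                           # stand-in BEIR queries
corpus = {"d1": {"title": "FIRST", "text": "Single token decoding ..."}}  # stand-in corpus
run = {"q1": [("d1", 13.2)]}                                              # stand-in retrieval run

converted = [
    {
        "query": queries[qid],
        "hits": [
            {"docid": pid, "score": score,
             "content": corpus[pid]["title"] + " " + corpus[pid]["text"]}
            for pid, score in hits[:100]  # --top_k 100
        ],
    }
    for qid, hits in run.items()
]

with open("rerank_input.json", "w") as f:
    json.dump(converted, f)
```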

bash/beir/run_eval.sh

Lines changed: 34 additions & 0 deletions

#!/bin/bash

# Check if eval_type argument is provided
if [ -z "$1" ]; then
    echo "Usage: $0 <eval_type>"
    exit 1
fi

EVAL_TYPE=$1
DATA_DIR="${REPO_DIR}/datasets/beir/"
OUTPUT_DIR="${REPO_DIR}/outputs/beir/"

# List of datasets to process
DATASETS=('trec-covid') # 'climate-fever' 'fever' 'hotpotqa' 'msmarco' 'nfcorpus' 'nq' 'fiqa' 'scidocs' 'scifact' 'dbpedia-entity' 'trec-covid'

# Iterate over datasets and process each one
for DATASET in "${DATASETS[@]}"; do
    echo "Evaluating dataset: ${DATASET}"

    # Execute the evaluation script
    # suffix: ce -> cross-encoder reranker | llm_FIRST_alpha -> FIRST model
    if python "${REPO_DIR}/scripts/eval.py" \
        --dataset "${DATASET}" \
        --output_path "${OUTPUT_DIR}" \
        --data_type "beir" \
        --suffix "llm_FIRST_alpha" \
        --eval_type "${EVAL_TYPE}" \
        --data_dir "${DATA_DIR}"; then
        echo "Successfully evaluated ${DATASET}"
    else
        echo "Failed to evaluate ${DATASET}" >&2
        exit 1
    fi
done

bash/beir/run_rerank_CE.sh

Lines changed: 26 additions & 0 deletions

#!/bin/bash

# Set directories
DATA_DIR="${REPO_DIR}/datasets/beir/"
OUTPUT_DIR="${REPO_DIR}/outputs/beir/"

# List of datasets to rerank
DATASETS=('trec-covid') # 'climate-fever' 'fever' 'hotpotqa' 'msmarco' 'nfcorpus' 'nq' 'fiqa' 'scidocs' 'scifact' 'dbpedia-entity'

# Iterate over datasets and rerank each one
for DATASET in "${DATASETS[@]}"; do
    echo "Reranking dataset: ${DATASET}"

    # Execute the rerank script with error handling
    if python "${REPO_DIR}/scripts/rerank_CE.py" \
        --dataset "${DATASET}" \
        --output_dir "${OUTPUT_DIR}" \
        --data_dir "${DATA_DIR}" \
        --data_type "beir" \
        --top_k 100; then
        echo "Successfully reranked ${DATASET} with CE reranker"
    else
        echo "Failed to rerank ${DATASET}" >&2
        exit 1
    fi
done
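
`scripts/rerank_CE.py` is likewise not shown here; a minimal sketch of cross-encoder reranking with sentence-transformers (the checkpoint name is an assumption, not necessarily what the script loads):

```
from sentence_transformers import CrossEncoder

model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)

def rerank(query, passages, top_k=100):
    # Score each (query, passage) pair jointly, then sort by score
    scores = model.predict([(query, p) for p in passages[:top_k]])
    order = sorted(range(len(scores)), key=lambda i: -scores[i])
    return [passages[i] for i in order]
```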

bash/beir/run_rerank_llm.sh

Lines changed: 37 additions & 0 deletions

#!/bin/bash

# Set directories and model
DATA_DIR="${REPO_DIR}/datasets/beir/"
OUTPUT_DIR="${REPO_DIR}/outputs/beir/"
MODEL_IN_USE="rryisthebest/First_Model"

# Configuration flags
USE_LOGITS=1 # Whether to use FIRST single-token logit decoding
USE_ALPHA=1  # Whether to use alphabetic identifiers

# List of datasets to rerank
DATASETS=('dbpedia-entity') # 'climate-fever' 'fever' 'hotpotqa' 'msmarco' 'nfcorpus' 'nq' 'fiqa' 'scidocs' 'scifact' 'trec-covid'

# Iterate over datasets and rerank each one
for DATASET in "${DATASETS[@]}"; do
    echo "Reranking dataset: ${DATASET}"

    # Execute the rerank script with error handling
    if python "${REPO_DIR}/scripts/rerank_llm.py" \
        --model "${MODEL_IN_USE}" \
        --dataset "${DATASET}" \
        --output_dir "${OUTPUT_DIR}" \
        --data_type "beir" \
        --data_dir "${DATA_DIR}" \
        --use_logits "${USE_LOGITS}" \
        --use_alpha "${USE_ALPHA}" \
        --llm_top_k 100 \
        --window_size 20 \
        --step_size 10 \
        --do_batched 1; then
        echo "Successfully reranked ${DATASET} with LLM reranker"
    else
        echo "Failed to rerank ${DATASET} with LLM reranker" >&2
        exit 1
    fi
done
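
The `--window_size 20` and `--step_size 10` flags imply the usual sliding-window strategy for listwise reranking beyond the model's context: rerank overlapping windows from the bottom of the candidate list upward so relevant passages can rise past window boundaries. A minimal sketch (the repo's batched implementation will differ):

```
def sliding_window_rerank(candidates, rerank_window, window_size=20, step_size=10):
    # Walk windows from the tail of the list toward the head
    end = len(candidates)
    while True:
        start = max(0, end - window_size)
        candidates[start:end] = rerank_window(candidates[start:end])
        if start == 0:
            return candidates
        end -= step_size

# Any callable that reorders a window works here, e.g. the single-token
# logit ranker sketched in the README section above (toy scorer shown)
reranked = sliding_window_rerank(list(range(100)), lambda window: sorted(window))
```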

bash/beir/run_train.sh

Lines changed: 27 additions & 0 deletions

#!/bin/bash

# Define model, dataset paths, and output directory
BASE_MODEL="HuggingFaceH4/zephyr-7b-beta"
TRAIN_DATA_PATH="rryisthebest/rank_zephyr_training_data_alpha" # Train dataset --> Hugging Face dataset or local dataset
EVAL_DATA_PATH="rryisthebest/evaluation_data_alpha"            # Eval dataset --> Hugging Face dataset or local dataset
OUTPUT_DIR="${REPO_DIR}/models/ranking/FIRST_Model"            # Directory to save the trained model
BEIR_DATA_DIR="${REPO_DIR}/datasets/beir/"

# Launch training with DeepSpeed configuration
accelerate launch --config_file "${REPO_DIR}/train_configs/accel_config_deepspeed.yaml" "${REPO_DIR}/scripts/train_ranking.py" \
    --model_name_or_path "${BASE_MODEL}" \
    --train_dataset_path "${TRAIN_DATA_PATH}" \
    --eval_dataset_path "${EVAL_DATA_PATH}" \
    --beir_data_path "${BEIR_DATA_DIR}" \
    --per_device_eval_batch_size 1 \
    --num_train_epochs 3 \
    --seed 42 \
    --per_device_train_batch_size 2 \
    --eval_steps 400 \
    --gradient_checkpointing \
    --gradient_accumulation_steps 16 \
    --lr_scheduler_type cosine \
    --num_warmup_steps 50 \
    --output_dir "${OUTPUT_DIR}" \
    --noisy_embedding_alpha 5 \
    --objective combined
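
The `--noisy_embedding_alpha 5` flag likely enables NEFTune-style embedding noise during training: uniform noise added to the input embeddings, scaled by `alpha / sqrt(seq_len * hidden_dim)`. A minimal sketch of that transform (the exact hook in `scripts/train_ranking.py` may differ):

```
import torch

def add_embedding_noise(embeds, alpha=5.0):
    # Scale follows NEFTune: alpha / sqrt(sequence_length * hidden_dim)
    seq_len, hidden = embeds.shape[-2], embeds.shape[-1]
    scale = alpha / (seq_len * hidden) ** 0.5
    return embeds + torch.empty_like(embeds).uniform_(-scale, scale)

# Example: noise a batch of token embeddings (batch=2, seq=128, dim=4096)
noisy = add_embedding_noise(torch.randn(2, 128, 4096))
```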
