diff --git a/scripts/Makefile b/scripts/Makefile
index caa53b3..76b9643 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -24,3 +24,8 @@ scored_mining_dataset:
 		--export_path='openvalidators_dataset.csv' \
 		--export_mining_with_scoring_dataset \
 
+openai_mining_dataset:
+	python3 data_collector.py \
+		--download_all \
+		--export_path='openvalidators_dataset.csv' \
+		--export_openai_dataset \
diff --git a/scripts/README.md b/scripts/README.md
index 7e5aa56..05bbd8c 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -51,7 +51,7 @@ The repository's Makefile includes the following targets to facilitate data coll
 make openvalidators_dataset
 ```
 
-This command downloads all the runs from the project and exports them to a CSV file named
+This command downloads all the runs from the latest version of the project and exports them to a CSV file named
 **openvalidators_dataset.csv**. It utilizes the following options under the hood of the [Makefile](Makefile):
 
@@ -76,8 +76,8 @@ This command downloads a specific run from the project and exports it to a CSV f
 ```bash
 make mining_dataset
 ```
-This command downloads all the runs from the project with a mining dataset and exports them to a CSV file named
-**openvalidators_dataset.csv**. It utilizes the following options:
+This command downloads all the runs from the latest version of the project with a mining dataset and exports them to a
+CSV file named **openvalidators_dataset.csv**. It utilizes the following options:
 
 - `--download_all`: Downloads all the runs.
 - `--export_path`: Specifies the path and filename for the exported CSV file.
@@ -90,13 +90,23 @@ This command downloads all the runs from the project with a mining dataset and e
 make scored_mining_dataset
 ```
 
-This command downloads all the runs from the project with a scored mining dataset and exports them to a CSV file named
-**openvalidators_dataset.csv**. It utilizes the following options:
+This command downloads all the runs from the latest version of the project with a scored mining dataset and exports them
+to a CSV file named **openvalidators_dataset.csv**. It utilizes the following options:
 
 - `--download_all`: Downloads all the runs.
 - `--export_path`: Specifies the path and filename for the exported CSV file.
 - `--export_mining_with_scoring_dataset`: Enables the export of mining dataset with scoring.
 
+---
+
+### `openai_mining_dataset`
+```bash
+make openai_mining_dataset
+```
+
+This command downloads all the runs from the latest version of the project and exports them to a jsonl file named
+**openai_mining_dataset_openvalidators.jsonl** in the [openai fine-tuning format](https://platform.openai.com/docs/guides/fine-tuning/prepare-training-data).
+
 Note: Feel completely free to adjust the [data_collector.py](data_collector.py) script and [Makefile](Makefile) as
 necessary to match your project configuration and requirements.
@@ -146,7 +156,13 @@ By default, it is set to **"validator_dataset.csv"**. Example usage: `--export_p
 - **--blacklist_path**: This parameter allows you to specify the path to a file containing blacklist phrases.
 The script will exclude any data that contains these phrases. By default, it is set to
 [blacklist_phrases.txt](blacklist_phrases.txt). Example usage: `--blacklist_path=blacklist_phrases.txt`.
-
+- **--export_openai_dataset**: This parameter is a flag that, when set, enables the export of the mining dataset
+in the [jsonl openai format for fine-tuning](https://platform.openai.com/docs/guides/fine-tuning):
+  ```jsonl
+  {"prompt": "base_prompt", "completion": "best_followup"}
+  {"prompt": "answer_prompt", "completion": "best_answer"}
+  ...
+  ```
 
 Make sure to adjust the parameters accordingly when executing the [data_collector.py](data_collector.py) script for
 your specific data collection needs.
\ No newline at end of file
diff --git a/scripts/data_collector.py b/scripts/data_collector.py
index 09cda73..24c1563 100644
--- a/scripts/data_collector.py
+++ b/scripts/data_collector.py
@@ -17,20 +17,21 @@ import bittensor as bt
 import argparse
-import tqdm
 import json
 import pandas as pd
 import openvalidators
 import os
 
 from analysis.utils import get_runs, download_data
 from traceback import print_exc
+from typing import List
+from data_formatter import create_json_dataset, create_csv_dataset, create_openai_dataset
 
 DEFAULT_PROJECT = 'opentensor-dev/openvalidators'
 DEFAULT_FILTERS = {"tags": {"$in": [openvalidators.__version__]}}
 
-def read_file_into_array(file_path: str) -> list[str]:
+def read_file_into_array(file_path: str) -> List[str]:
     """Reads a file into an array of strings"""
     bt.logging.info(f"Loading blacklists phrases from {file_path}")
     with open(file_path, 'r') as file:
@@ -73,76 +74,13 @@ def collect_data(
     return df
 
-
-def create_json_dataset(
-    df: pd.DataFrame,
-    include_scoring: bool,
-    blacklist: list[str]
-) -> dict:
-    dict_dataset = {}
-
-    for _, row in tqdm.tqdm(df.iterrows(), desc='Creating mining dataset', total=len(df), unit='run'):
-        base_prompt = row['base_prompt']
-        best_followup = row['best_followup']
-
-        answer_prompt = row['answer_prompt']
-        best_answer = row['best_answer']
-
-        if best_answer not in blacklist:
-            if include_scoring:
-                scores = 0
-                if isinstance(row["answer_rewards"], list):
-                    scores = max(row["answer_rewards"])
-                elif isinstance(row["answer_rewards"], float):
-                    scores = row["answer_rewards"]
-
-                dict_dataset[answer_prompt] = {best_answer: scores}
-            else:
-                dict_dataset[answer_prompt] = best_answer
-
-        if best_followup not in blacklist:
-            if include_scoring:
-                scores = 0
-                if isinstance(row["answer_rewards"], list):
-                    scores = max(row["answer_rewards"])
-                elif isinstance(row["answer_rewards"], float):
-                    scores = row["answer_rewards"]
-                dict_dataset[base_prompt] = {best_followup: scores}
-            else:
-                dict_dataset[base_prompt] = best_followup
-
-    return dict_dataset
-
-def create_csv_dataset(
-    df: pd.DataFrame,
-    include_scoring: bool,
-    blacklist: list[str]
-) -> pd.DataFrame:
-    if include_scoring:
-        mining_df = df[['base_prompt', 'best_followup', 'followup_rewards', 'answer_prompt', 'best_answer', 'answer_rewards']]
-        # Excludes blacklisted phrases from the dataset
-        filtered_df = mining_df[~df['best_followup'].isin(blacklist)]
-        filtered_df = filtered_df[~df['best_answer'].isin(blacklist)]
-
-        # Gets the max score for each answer and followup
-        filtered_df['followup_rewards'] = filtered_df['followup_rewards'].apply(lambda rewards: max(rewards))
-        filtered_df['answer_rewards'] = filtered_df['answer_rewards'].apply(lambda rewards: max(rewards))
-
-        return filtered_df
-    else:
-        mining_df = df[['base_prompt', 'best_followup', 'answer_prompt', 'best_answer']]
-        # Excludes blacklisted phrases from the dataset
-        filtered_df = mining_df[~df['best_followup'].isin(blacklist)]
-        filtered_df = filtered_df[~df['best_answer'].isin(blacklist)]
-
-        return filtered_df
-
-
 def create_mining_dataset(
     df: pd.DataFrame,
     export_path: str,
     mining_dataset_output_format: str,
-    blacklist_phrases: list[str],
-    with_score: bool =False):
+    blacklist_phrases: List[str],
+    with_score: bool = False,
+    export_openai_dataset: bool = False):
     """Creates a dataset for mining from the dataframe of wandb run logs.
     Args:
         df (pd.DataFrame): Dataframe of wandb run logs.
@@ -157,7 +95,13 @@ def create_mining_dataset(
 
     bt.logging.info(f"Creating mining dataset: {mining_dataset_path}")
 
-    if mining_dataset_output_format == 'json':
+    if export_openai_dataset:
+        jsonl_dataset = create_openai_dataset(df=df, blacklist=blacklist_phrases)
+
+        with open("openai_mining_dataset_openvalidators.jsonl", "w") as file:
+            file.write(jsonl_dataset)
+
+    elif mining_dataset_output_format == 'json':
         dict_dataset = create_json_dataset(
             df=df,
             include_scoring=with_score,
@@ -193,6 +137,7 @@ def create_mining_dataset(
     parser.add_argument("--mining_dataset_output_format", type=str, help="Specify the output format of the mining dataset", default="json")
     parser.add_argument("--export_path", type=str, help="Specify the path to export the dataset", default="validator_dataset.csv")
     parser.add_argument("--blacklist_path", type=str, help="Specify the path to the blacklist phrases", default="blacklist_phrases.txt")
+    parser.add_argument("--export_openai_dataset", action="store_true", help="Exports the mining dataset in the openai fine-tuning format", default=False)
 
     args = parser.parse_args()
 
@@ -203,6 +148,7 @@ def create_mining_dataset(
     export_mining_with_scoring_dataset = args.export_mining_with_scoring_dataset
     export_path = args.export_path
     mining_dataset_output_format = args.mining_dataset_output_format
+    export_openai_dataset = args.export_openai_dataset
 
     bt.logging.info("Current version of openvalidators: " + openvalidators.__version__)
 
@@ -213,13 +159,14 @@ def create_mining_dataset(
         collected_data = collect_data(download_all, export_path, wandb_run_id, include_tags)
 
         # Creates mining dataset
-        if export_mining_dataset or export_mining_with_scoring_dataset:
+        if export_mining_dataset or export_mining_with_scoring_dataset or export_openai_dataset:
             create_mining_dataset(
                 df=collected_data,
                 export_path=export_path,
                 mining_dataset_output_format=mining_dataset_output_format,
                 blacklist_phrases=blacklist_phrases,
-                with_score=export_mining_with_scoring_dataset
+                with_score=export_mining_with_scoring_dataset,
+                export_openai_dataset=export_openai_dataset
             )
     except Exception as e:
         bt.logging.error("Error in training loop", str(e))
diff --git a/scripts/data_formatter.py b/scripts/data_formatter.py
new file mode 100644
index 0000000..298b580
--- /dev/null
+++ b/scripts/data_formatter.py
@@ -0,0 +1,102 @@
+import pandas as pd
+import tqdm
+import json
+from typing import List
+from dataclasses import dataclass
+
+@dataclass
+class OpenAISample:
+    prompt: str
+    completion: str
+
+def create_json_dataset(
+    df: pd.DataFrame,
+    include_scoring: bool,
+    blacklist: List[str]
+) -> dict:
+    dict_dataset = {}
+
+    for _, row in tqdm.tqdm(df.iterrows(), desc='Creating mining dataset', total=len(df), unit='run'):
+        base_prompt = row['base_prompt']
+        best_followup = row['best_followup']
+
+        answer_prompt = row['answer_prompt']
+        best_answer = row['best_answer']
+
+        if best_answer not in blacklist:
+            if include_scoring:
+                scores = 0
+                if isinstance(row["answer_rewards"], list):
+                    scores = max(row["answer_rewards"])
+                elif isinstance(row["answer_rewards"], float):
+                    scores = row["answer_rewards"]
+
+                dict_dataset[answer_prompt] = {best_answer: scores}
+            else:
+                dict_dataset[answer_prompt] = best_answer
+
+        if best_followup not in blacklist:
+            if include_scoring:
+                scores = 0
+                if isinstance(row["followup_rewards"], list):
+                    scores = max(row["followup_rewards"])
+                elif isinstance(row["followup_rewards"], float):
+                    scores = row["followup_rewards"]
+                dict_dataset[base_prompt] = {best_followup: scores}
+            else:
+                dict_dataset[base_prompt] = best_followup
+
+    return dict_dataset
+
+
+def create_csv_dataset(
+    df: pd.DataFrame,
+    include_scoring: bool,
+    blacklist: List[str]
+) -> pd.DataFrame:
+    if include_scoring:
+        mining_df = df[['base_prompt', 'best_followup', 'followup_rewards', 'answer_prompt', 'best_answer', 'answer_rewards']]
+        # Excludes blacklisted phrases from the dataset
+        filtered_df = mining_df[~df['best_followup'].isin(blacklist)]
+        filtered_df = filtered_df[~df['best_answer'].isin(blacklist)]
+
+        # Gets the max score for each answer and followup
+        filtered_df['followup_rewards'] = filtered_df['followup_rewards'].apply(lambda rewards: max(rewards))
+        filtered_df['answer_rewards'] = filtered_df['answer_rewards'].apply(lambda rewards: max(rewards))
+
+        return filtered_df
+    else:
+        mining_df = df[['base_prompt', 'best_followup', 'answer_prompt', 'best_answer']]
+        # Excludes blacklisted phrases from the dataset
+        filtered_df = mining_df[~df['best_followup'].isin(blacklist)]
+        filtered_df = filtered_df[~df['best_answer'].isin(blacklist)]
+
+        return filtered_df
+
+
+def create_openai_dataset(
+    df: pd.DataFrame,
+    blacklist: List[str]
+) -> str:
+    samples = []
+
+    for _, row in tqdm.tqdm(df.iterrows(), desc='Creating openai mining dataset', total=len(df), unit='run'):
+        base_prompt = row['base_prompt']
+        best_followup = row['best_followup']
+
+        answer_prompt = row['answer_prompt']
+        best_answer = row['best_answer']
+
+        if best_followup not in blacklist:
+            samples += [OpenAISample(base_prompt, best_followup)]
+
+        if best_answer not in blacklist:
+            samples += [OpenAISample(answer_prompt, best_answer)]
+
+    # Serializes the collected samples as jsonl: one JSON object per line
+    jsonl_data = "\n".join(
+        json.dumps({"prompt": sample.prompt, "completion": sample.completion})
+        for sample in samples
+    )
+
+    return jsonl_data
\ No newline at end of file
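
For reviewers who want to see the new exporter in action, here is a minimal sketch of what `create_openai_dataset` emits. The dataframe below is a made-up stand-in for the wandb run logs; the only assumption is that `scripts/data_formatter.py` is importable from the working directory:

```python
import pandas as pd

from data_formatter import create_openai_dataset

# Toy stand-in for the wandb run logs; the column names are the ones
# create_openai_dataset reads, but the values are invented.
df = pd.DataFrame({
    "base_prompt": ["Ask a question about the text."],
    "best_followup": ["What is the main claim of the text?"],
    "answer_prompt": ["Answer the question above."],
    "best_answer": ["The text claims decentralized validation scales."],
})

# Each row yields up to two (prompt, completion) pairs: the followup
# pair first, then the answer pair. An empty blacklist keeps both.
print(create_openai_dataset(df=df, blacklist=[]))
# {"prompt": "Ask a question about the text.", "completion": "What is the main claim of the text?"}
# {"prompt": "Answer the question above.", "completion": "The text claims decentralized validation scales."}
```

Because the blacklist check is applied per completion rather than per row, a blacklisted followup drops only its own pair, and the same row's answer pair still makes it into the dataset.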
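
A malformed line will be rejected when the file is uploaded for fine-tuning, so a quick local sanity check can save a round trip. This is a hypothetical snippet, assuming the exporter has already written `openai_mining_dataset_openvalidators.jsonl` to the working directory as `create_mining_dataset` does:

```python
import json

# Verify every line of the export parses as a standalone JSON object
# carrying exactly the prompt/completion keys the format requires.
with open("openai_mining_dataset_openvalidators.jsonl") as f:
    for line_number, line in enumerate(f, start=1):
        sample = json.loads(line)
        assert set(sample) == {"prompt", "completion"}, f"unexpected keys on line {line_number}"
```

Note that the exporter writes raw prompt/completion pairs; any extra preparation the linked fine-tuning guide recommends, such as separator suffixes on prompts, is left to the consumer of the file.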