Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions scripts/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,8 @@ scored_mining_dataset:
--export_path='openvalidators_dataset.csv' \
--export_mining_with_scoring_dataset \

openai_mining_dataset:
python3 data_collector.py \
--download_all \
--export_path='openvalidators_dataset.csv' \
--export_openai_dataset \
28 changes: 22 additions & 6 deletions scripts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ The repository's Makefile includes the following targets to facilitate data coll
make openvalidators_dataset
```

This command downloads all the runs from the project and exports them to a CSV file named
This command downloads all the runs from the latest version of the project and exports them to a CSV file named
**openvalidators_dataset.csv**.
It utilizes the following options under the hood of the [Makefile](Makefile):

Expand All @@ -76,8 +76,8 @@ This command downloads a specific run from the project and exports it to a CSV f
```bash
make mining_dataset
```
This command downloads all the runs from the project with a mining dataset and exports them to a CSV file named
**openvalidators_dataset.csv**. It utilizes the following options:
This command downloads all the runs from the latest version of the project with a mining dataset and exports them to a
CSV file named **openvalidators_dataset.csv**. It utilizes the following options:

- `--download_all`: Downloads all the runs.
- `--export_path`: Specifies the path and filename for the exported CSV file.
Expand All @@ -90,13 +90,23 @@ This command downloads all the runs from the project with a mining dataset and e
make scored_mining_dataset
```

This command downloads all the runs from the project with a scored mining dataset and exports them to a CSV file named
**openvalidators_dataset.csv**. It utilizes the following options:
This command downloads all the runs from the latest version of the project with a scored mining dataset and exports them
to a CSV file named **openvalidators_dataset.csv**. It utilizes the following options:

- `--download_all`: Downloads all the runs.
- `--export_path`: Specifies the path and filename for the exported CSV file.
- `--export_mining_with_scoring_dataset`: Enables the export of mining dataset with scoring.

---

### `openai_mining_dataset`
```bash
make openai_mining_dataset
```

This command downloads all the runs from the latest version of the project and exports them to a JSONL file named
**openai_mining_dataset_openvalidators.jsonl** in the [openai fine-tuning format](https://platform.openai.com/docs/guides/fine-tuning/prepare-training-data).

Note: Feel completely free to adjust the [data_collector.py](data_collector.py) script and [Makefile](Makefile) as necessary to
match your project configuration and requirements.

Expand Down Expand Up @@ -146,7 +156,13 @@ By default, it is set to **"validator_dataset.csv"**. Example usage: `--export_p
- **--blacklist_path**: This parameter allows you to specify the path to a file containing blacklist phrases.
The script will exclude any data that contains these phrases. By default, it is set to [blacklist_phrases.txt](blacklist_phrases.txt).
Example usage: `--blacklist_path=blacklist_phrases.txt`.

- **--export_openai_dataset**: This parameter is a flag that, when set, enables the export of the mining dataset
in the [jsonl openai format for fine-tuning](https://platform.openai.com/docs/guides/fine-tuning):
```json lines
{"prompt": "base_prompt", "completion": "best_followup"}
{"prompt": "answer_prompt", "completion": "best_answer"}
...
```

Make sure to adjust the parameters accordingly when executing the [data_collector.py](data_collector.py) script for your
specific data collection needs.
89 changes: 18 additions & 71 deletions scripts/data_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,21 @@

import bittensor as bt
import argparse
import tqdm
import json
import pandas as pd
import openvalidators
import os
from analysis.utils import get_runs, download_data
from traceback import print_exc
from typing import List
from data_formatter import create_json_dataset, create_csv_dataset, create_openai_dataset


DEFAULT_PROJECT = 'opentensor-dev/openvalidators'
DEFAULT_FILTERS = {"tags": {"$in": [openvalidators.__version__]}}


def read_file_into_array(file_path: str) -> list[str]:
def read_file_into_array(file_path: str) -> List[str]:
"""Reads a file into an array of strings"""
bt.logging.info(f"Loading blacklists phrases from {file_path}")
with open(file_path, 'r') as file:
Expand Down Expand Up @@ -73,76 +74,13 @@ def collect_data(
return df


def create_json_dataset(
df: pd.DataFrame,
include_scoring: bool,
blacklist: list[str]
) -> dict:
dict_dataset = {}

for _, row in tqdm.tqdm(df.iterrows(), desc='Creating mining dataset', total=len(df), unit='run'):
base_prompt = row['base_prompt']
best_followup = row['best_followup']

answer_prompt = row['answer_prompt']
best_answer = row['best_answer']

if best_answer not in blacklist:
if include_scoring:
scores = 0
if isinstance(row["answer_rewards"], list):
scores = max(row["answer_rewards"])
elif isinstance(row["answer_rewards"], float):
scores = row["answer_rewards"]

dict_dataset[answer_prompt] = {best_answer: scores}
else:
dict_dataset[answer_prompt] = best_answer

if best_followup not in blacklist:
if include_scoring:
scores = 0
if isinstance(row["answer_rewards"], list):
scores = max(row["answer_rewards"])
elif isinstance(row["answer_rewards"], float):
scores = row["answer_rewards"]
dict_dataset[base_prompt] = {best_followup: scores}
else:
dict_dataset[base_prompt] = best_followup

return dict_dataset

def create_csv_dataset(
df: pd.DataFrame,
include_scoring: bool,
blacklist: list[str]
) -> pd.DataFrame:
if include_scoring:
mining_df = df[['base_prompt', 'best_followup', 'followup_rewards', 'answer_prompt', 'best_answer', 'answer_rewards']]
# Excludes blacklisted phrases from the dataset
filtered_df = mining_df[~df['best_followup'].isin(blacklist)]
filtered_df = filtered_df[~df['best_answer'].isin(blacklist)]

# Gets the max score for each answer and followup
filtered_df['followup_rewards'] = filtered_df['followup_rewards'].apply(lambda rewards: max(rewards))
filtered_df['answer_rewards'] = filtered_df['answer_rewards'].apply(lambda rewards: max(rewards))

return filtered_df
else:
mining_df = df[['base_prompt', 'best_followup', 'answer_prompt', 'best_answer']]
# Excludes blacklisted phrases from the dataset
filtered_df = mining_df[~df['best_followup'].isin(blacklist)]
filtered_df = filtered_df[~df['best_answer'].isin(blacklist)]

return filtered_df


def create_mining_dataset(
df: pd.DataFrame,
export_path: str,
mining_dataset_output_format: str,
blacklist_phrases: list[str],
with_score: bool =False):
blacklist_phrases: List[str],
with_score: bool =False,
export_openai_dataset: bool = False):
"""Creates a dataset for mining from the dataframe of wandb run logs.
Args:
df (pd.DataFrame): Dataframe of wandb run logs.
Expand All @@ -157,7 +95,13 @@ def create_mining_dataset(

bt.logging.info(f"Creating mining dataset: {mining_dataset_path}")

if mining_dataset_output_format == 'json':
if export_openai_dataset:
jsonl_dataset = create_openai_dataset(df=df, blacklist=blacklist_phrases)

with open("openai_mining_dataset_openvalidators.jsonl", "w") as file:
file.write(jsonl_dataset)

elif mining_dataset_output_format == 'json':
dict_dataset = create_json_dataset(
df=df,
include_scoring=with_score,
Expand Down Expand Up @@ -193,6 +137,7 @@ def create_mining_dataset(
parser.add_argument("--mining_dataset_output_format", type=str, help="Specify the output format of the mining dataset", default="json")
parser.add_argument("--export_path", type=str, help="Specify the path to export the dataset", default="validator_dataset.csv")
parser.add_argument("--blacklist_path", type=str, help="Specify the path to the blacklist phrases", default="blacklist_phrases.txt")
parser.add_argument("--export_openai_dataset", action="store_true", help="Exports the openai dataset", default=False)

args = parser.parse_args()

Expand All @@ -203,6 +148,7 @@ def create_mining_dataset(
export_mining_with_scoring_dataset = args.export_mining_with_scoring_dataset
export_path = args.export_path
mining_dataset_output_format = args.mining_dataset_output_format
export_openai_dataset = args.export_openai_dataset

bt.logging.info("Current version of openvalidators: " + openvalidators.__version__)

Expand All @@ -213,13 +159,14 @@ def create_mining_dataset(
collected_data = collect_data(download_all, export_path, wandb_run_id, include_tags)

# Creates mining dataset
if export_mining_dataset or export_mining_with_scoring_dataset:
if export_mining_dataset or export_mining_with_scoring_dataset or export_openai_dataset:
create_mining_dataset(
df=collected_data,
export_path=export_path,
mining_dataset_output_format=mining_dataset_output_format,
blacklist_phrases=blacklist_phrases,
with_score=export_mining_with_scoring_dataset
with_score=export_mining_with_scoring_dataset,
export_openai_dataset=export_openai_dataset
)
except Exception as e:
bt.logging.error("Error in training loop", str(e))
Expand Down
102 changes: 102 additions & 0 deletions scripts/data_formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import pandas as pd
import tqdm
import json
from typing import List
from dataclasses import dataclass

@dataclass
class OpenAISample:
    """A single prompt/completion training example, serialized to one line
    of the OpenAI fine-tuning JSONL export (see create_openai_dataset)."""
    prompt: str  # conditioning text presented to the model
    completion: str  # target text the model is trained to produce

def create_json_dataset(
    df: pd.DataFrame,
    include_scoring: bool,
    blacklist: List[str]
) -> dict:
    """Builds a mining dataset mapping prompts to their best responses.

    Args:
        df (pd.DataFrame): Dataframe of wandb run logs with the columns
            'base_prompt', 'best_followup', 'answer_prompt', 'best_answer'
            and, when include_scoring is set, 'followup_rewards' and
            'answer_rewards'.
        include_scoring (bool): When True, each prompt maps to a
            {response: max_reward} dict instead of the bare response string.
        blacklist (List[str]): Responses to exclude from the dataset.

    Returns:
        dict: Mapping of prompt -> response, or prompt -> {response: score}
            when include_scoring is True.
    """
    # tqdm is only a progress indicator; fall back to plain iteration when
    # it is unavailable so the dataset can still be built.
    try:
        from tqdm import tqdm as _progress
    except ImportError:
        def _progress(iterable, **_kwargs):
            return iterable

    def _max_reward(rewards) -> float:
        # Rewards are logged either as a list of floats or as a single
        # float; empty or unrecognized values score 0 (the previous code
        # raised ValueError on an empty list).
        if isinstance(rewards, list):
            return max(rewards) if rewards else 0
        if isinstance(rewards, float):
            return rewards
        return 0

    dict_dataset = {}

    for _, row in _progress(df.iterrows(), desc='Creating mining dataset', total=len(df), unit='run'):
        base_prompt = row['base_prompt']
        best_followup = row['best_followup']

        answer_prompt = row['answer_prompt']
        best_answer = row['best_answer']

        if best_answer not in blacklist:
            if include_scoring:
                dict_dataset[answer_prompt] = {best_answer: _max_reward(row["answer_rewards"])}
            else:
                dict_dataset[answer_prompt] = best_answer

        if best_followup not in blacklist:
            if include_scoring:
                # Bug fix: the followup was previously scored with
                # 'answer_rewards'; use 'followup_rewards' to match
                # create_csv_dataset.
                dict_dataset[base_prompt] = {best_followup: _max_reward(row["followup_rewards"])}
            else:
                dict_dataset[base_prompt] = best_followup

    return dict_dataset


def create_csv_dataset(
    df: pd.DataFrame,
    include_scoring: bool,
    blacklist: List[str]
) -> pd.DataFrame:
    """Builds a tabular mining dataset from wandb run logs.

    Args:
        df (pd.DataFrame): Dataframe of wandb run logs with the columns
            'base_prompt', 'best_followup', 'answer_prompt', 'best_answer'
            and, when include_scoring is set, 'followup_rewards' and
            'answer_rewards'.
        include_scoring (bool): When True, the reward columns are included
            and reduced to the single max score per row.
        blacklist (List[str]): Phrases whose rows are excluded when they
            appear as 'best_followup' or 'best_answer'.

    Returns:
        pd.DataFrame: Filtered dataframe of mining samples.
    """
    def _max_reward(rewards):
        # Rewards are logged either as a list of floats or as a single
        # float; empty or unrecognized values score 0. Mirrors the
        # handling in create_json_dataset (the previous code raised on
        # non-list rewards).
        if isinstance(rewards, list):
            return max(rewards) if rewards else 0
        if isinstance(rewards, float):
            return rewards
        return 0

    # Excludes rows where either the followup or the answer is blacklisted.
    keep = ~(df['best_followup'].isin(blacklist) | df['best_answer'].isin(blacklist))

    if include_scoring:
        columns = ['base_prompt', 'best_followup', 'followup_rewards', 'answer_prompt', 'best_answer', 'answer_rewards']
        # .copy() so the assignments below modify a real frame instead of a
        # view (avoids pandas SettingWithCopyWarning / silent no-ops).
        filtered_df = df.loc[keep, columns].copy()

        # Keeps only the max score for each followup and answer
        filtered_df['followup_rewards'] = filtered_df['followup_rewards'].apply(_max_reward)
        filtered_df['answer_rewards'] = filtered_df['answer_rewards'].apply(_max_reward)

        return filtered_df

    columns = ['base_prompt', 'best_followup', 'answer_prompt', 'best_answer']
    return df.loc[keep, columns]


def create_openai_dataset(
    df: pd.DataFrame,
    blacklist: List[str]
) -> str:
    """Builds a JSONL dataset in the OpenAI fine-tuning format.

    Each run contributes up to two samples: (base_prompt, best_followup)
    and (answer_prompt, best_answer). Samples whose completion appears in
    the blacklist are skipped.

    Args:
        df (pd.DataFrame): Dataframe of wandb run logs with the columns
            'base_prompt', 'best_followup', 'answer_prompt', 'best_answer'.
        blacklist (List[str]): Completions to exclude from the dataset.

    Returns:
        str: Newline-separated JSON objects of the form
            {"prompt": ..., "completion": ...}.
    """
    samples = []

    for _, row in tqdm.tqdm(df.iterrows(), desc='Creating openai mining dataset', total=len(df), unit='run'):
        base_prompt = row['base_prompt']
        best_followup = row['best_followup']

        answer_prompt = row['answer_prompt']
        best_answer = row['best_answer']

        # The blacklist is checked against the completion side of each pair.
        if best_followup not in blacklist:
            samples.append(OpenAISample(base_prompt, best_followup))

        if best_answer not in blacklist:
            samples.append(OpenAISample(answer_prompt, best_answer))

    # One JSON object per line (JSONL); no trailing newline.
    return "\n".join(
        json.dumps({"prompt": sample.prompt, "completion": sample.completion})
        for sample in samples
    )