diff --git a/applications/Chat/README.md b/applications/Chat/README.md
index 59e2c4548365..d5be04ab9f44 100644
--- a/applications/Chat/README.md
+++ b/applications/Chat/README.md
@@ -413,7 +413,7 @@ You may contact us or participate in the following ways:
1. [Leaving a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your like and support. Thanks!
2. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), or submitting a PR on GitHub follow the guideline in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
3. Join the Colossal-AI community on
- [Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w),
+ [Slack](https://github.com/hpcaitech/public_assets/tree/main/colossalai/contact/slack),
and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
4. Send your official proposal to email contact@hpcaitech.com
diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
index 90471ed727b0..0d0e2a7d34f5 100644
--- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
+++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -27,7 +27,7 @@ def get_model_numel(model: nn.Module, strategy: Strategy) -> int:
def preprocess_batch(samples) -> dict:
input_ids = torch.stack(samples)
attention_mask = torch.ones_like(input_ids, dtype=torch.long)
- return {'input_ids': input_ids, 'attention_mask': attention_mask}
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
def print_rank_0(*args, **kwargs) -> None:
@@ -39,32 +39,32 @@ def print_model_numel(model_dict: dict) -> None:
B = 1024**3
M = 1024**2
K = 1024
- outputs = ''
+ outputs = ""
for name, numel in model_dict.items():
- outputs += f'{name}: '
+ outputs += f"{name}: "
if numel >= B:
- outputs += f'{numel / B:.2f} B\n'
+ outputs += f"{numel / B:.2f} B\n"
elif numel >= M:
- outputs += f'{numel / M:.2f} M\n'
+ outputs += f"{numel / M:.2f} M\n"
elif numel >= K:
- outputs += f'{numel / K:.2f} K\n'
+ outputs += f"{numel / K:.2f} K\n"
else:
- outputs += f'{numel}\n'
+ outputs += f"{numel}\n"
print_rank_0(outputs)
def get_gpt_config(model_name: str) -> OPTConfig:
model_map = {
- '125m': OPTConfig.from_pretrained('facebook/opt-125m'),
- '350m': OPTConfig(hidden_size=1024, ffn_dim=4096, num_hidden_layers=24, num_attention_heads=16),
- '700m': OPTConfig(hidden_size=1280, ffn_dim=5120, num_hidden_layers=36, num_attention_heads=20),
- '1.3b': OPTConfig.from_pretrained('facebook/opt-1.3b'),
- '2.7b': OPTConfig.from_pretrained('facebook/opt-2.7b'),
- '3.5b': OPTConfig(hidden_size=3072, ffn_dim=12288, num_hidden_layers=32, num_attention_heads=32),
- '5.5b': OPTConfig(hidden_size=3840, ffn_dim=15360, num_hidden_layers=32, num_attention_heads=32),
- '6.7b': OPTConfig.from_pretrained('facebook/opt-6.7b'),
- '10b': OPTConfig(hidden_size=5120, ffn_dim=20480, num_hidden_layers=32, num_attention_heads=32),
- '13b': OPTConfig.from_pretrained('facebook/opt-13b'),
+ "125m": OPTConfig.from_pretrained("facebook/opt-125m"),
+ "350m": OPTConfig(hidden_size=1024, ffn_dim=4096, num_hidden_layers=24, num_attention_heads=16),
+ "700m": OPTConfig(hidden_size=1280, ffn_dim=5120, num_hidden_layers=36, num_attention_heads=20),
+ "1.3b": OPTConfig.from_pretrained("facebook/opt-1.3b"),
+ "2.7b": OPTConfig.from_pretrained("facebook/opt-2.7b"),
+ "3.5b": OPTConfig(hidden_size=3072, ffn_dim=12288, num_hidden_layers=32, num_attention_heads=32),
+ "5.5b": OPTConfig(hidden_size=3840, ffn_dim=15360, num_hidden_layers=32, num_attention_heads=32),
+ "6.7b": OPTConfig.from_pretrained("facebook/opt-6.7b"),
+ "10b": OPTConfig(hidden_size=5120, ffn_dim=20480, num_hidden_layers=32, num_attention_heads=32),
+ "13b": OPTConfig.from_pretrained("facebook/opt-13b"),
}
try:
return model_map[model_name]
@@ -73,20 +73,20 @@ def get_gpt_config(model_name: str) -> OPTConfig:
def main(args):
- if args.strategy == 'ddp':
+ if args.strategy == "ddp":
strategy = DDPStrategy()
- elif args.strategy == 'colossalai_gemini':
- strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5)
- elif args.strategy == 'colossalai_gemini_cpu':
- strategy = GeminiStrategy(placement_policy='cpu', initial_scale=2**5)
- elif args.strategy == 'colossalai_zero2':
- strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
- elif args.strategy == 'colossalai_zero2_cpu':
- strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
- elif args.strategy == 'colossalai_zero1':
- strategy = LowLevelZeroStrategy(stage=1, placement_policy='cuda')
- elif args.strategy == 'colossalai_zero1_cpu':
- strategy = LowLevelZeroStrategy(stage=1, placement_policy='cpu')
+ elif args.strategy == "colossalai_gemini":
+ strategy = GeminiStrategy(placement_policy="static",initial_scale=2**5)
+ elif args.strategy == "colossalai_gemini_cpu":
+ strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
+ elif args.strategy == "colossalai_zero2":
+ strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
+ elif args.strategy == "colossalai_zero2_cpu":
+ strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
+ elif args.strategy == "colossalai_zero1":
+ strategy = LowLevelZeroStrategy(stage=1, placement_policy="cuda")
+ elif args.strategy == "colossalai_zero1_cpu":
+ strategy = LowLevelZeroStrategy(stage=1, placement_policy="cpu")
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
@@ -103,90 +103,106 @@ def main(args):
if args.use_kernels:
from coati.kernels import convert_to_xformer_model
- actor, critic, initial_model, reward_model = map(convert_to_xformer_model,
- (actor, critic, initial_model, reward_model))
+
+ actor, critic, initial_model, reward_model = map(
+ convert_to_xformer_model, (actor, critic, initial_model, reward_model)
+ )
actor_numel = get_model_numel(actor, strategy)
critic_numel = get_model_numel(critic, strategy)
initial_model_numel = get_model_numel(initial_model, strategy)
reward_model_numel = get_model_numel(reward_model, strategy)
- print_model_numel({
- 'Actor': actor_numel,
- 'Critic': critic_numel,
- 'Initial model': initial_model_numel,
- 'Reward model': reward_model_numel
- })
- performance_evaluator = PerformanceEvaluator(actor_numel,
- critic_numel,
- initial_model_numel,
- reward_model_numel,
- enable_grad_checkpoint=False,
- ignore_episodes=1)
-
- if args.strategy.startswith('colossalai'):
+ print_model_numel(
+ {
+ "Actor": actor_numel,
+ "Critic": critic_numel,
+ "Initial model": initial_model_numel,
+ "Reward model": reward_model_numel,
+ }
+ )
+ performance_evaluator = PerformanceEvaluator(
+ actor_numel,
+ critic_numel,
+ initial_model_numel,
+ reward_model_numel,
+ enable_grad_checkpoint=False,
+ ignore_episodes=1,
+ )
+
+ if args.strategy.startswith("colossalai"):
actor_optim = HybridAdam(actor.parameters(), lr=5e-6)
critic_optim = HybridAdam(critic.parameters(), lr=5e-6)
else:
actor_optim = Adam(actor.parameters(), lr=5e-6)
critic_optim = Adam(critic.parameters(), lr=5e-6)
- tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m')
+ tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.padding_side = "left"
(actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim))
random_prompts = torch.randint(tokenizer.vocab_size, (1000, 256), device=torch.cuda.current_device())
- dataloader = DataLoader(random_prompts,
- batch_size=args.experience_batch_size,
- shuffle=True,
- collate_fn=preprocess_batch)
-
- trainer = PPOTrainer(strategy,
- actor,
- critic,
- reward_model,
- initial_model,
- actor_optim,
- critic_optim,
- ptx_coef=0,
- train_batch_size=args.train_batch_size,
- offload_inference_models=args.offload_inference_models,
- max_length=512,
- do_sample=True,
- temperature=1.0,
- top_k=50,
- use_cache=True,
- pad_token_id=tokenizer.pad_token_id,
- eos_token_id=tokenizer.eos_token_id,
- callbacks=[performance_evaluator])
-
- trainer.fit(prompt_dataloader=dataloader,
- pretrain_dataloader=None,
- num_episodes=args.num_episodes,
- num_update_steps=args.num_update_steps,
- num_collect_steps=args.num_collect_steps)
-
- print_rank_0(f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB')
-
-
-if __name__ == '__main__':
+ dataloader = DataLoader(
+ random_prompts, batch_size=args.experience_batch_size, shuffle=True, collate_fn=preprocess_batch
+ )
+
+ trainer = PPOTrainer(
+ strategy,
+ actor,
+ critic,
+ reward_model,
+ initial_model,
+ actor_optim,
+ critic_optim,
+ tokenizer=tokenizer,
+ ptx_coef=0,
+ train_batch_size=args.train_batch_size,
+ offload_inference_models=args.offload_inference_models,
+ max_length=512,
+ do_sample=True,
+ temperature=1.0,
+ top_k=50,
+ use_cache=True,
+ callbacks=[performance_evaluator],
+ )
+
+ trainer.fit(
+ prompt_dataloader=dataloader,
+ pretrain_dataloader=None,
+ num_episodes=args.num_episodes,
+ num_update_steps=args.num_update_steps,
+ num_collect_steps=args.num_collect_steps,
+ )
+
+ print_rank_0(f"Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB")
+
+
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--model', default='125m')
- parser.add_argument('--critic_model', default='125m')
- parser.add_argument('--strategy',
- choices=[
- 'ddp', 'colossalai_gemini', 'colossalai_gemini_cpu', 'colossalai_zero2',
- 'colossalai_zero2_cpu', 'colossalai_zero1', 'colossalai_zero1_cpu'
- ],
- default='ddp')
- parser.add_argument('--num_episodes', type=int, default=3)
- parser.add_argument('--num_collect_steps', type=int, default=8)
- parser.add_argument('--num_update_steps', type=int, default=1)
- parser.add_argument('--train_batch_size', type=int, default=8)
- parser.add_argument('--experience_batch_size', type=int, default=8)
- parser.add_argument('--lora_rank', type=int, default=0)
- parser.add_argument('--cuda_mem_frac', type=float, default=1.0)
- parser.add_argument('--offload_inference_models', action='store_true', default=False)
- parser.add_argument('--use_kernels', action='store_true', default=False)
+ parser.add_argument("--model", default="125m")
+ parser.add_argument("--critic_model", default="125m")
+ parser.add_argument(
+ "--strategy",
+ choices=[
+ "ddp",
+ "colossalai_gemini",
+ "colossalai_gemini_cpu",
+ "colossalai_zero2",
+ "colossalai_zero2_cpu",
+ "colossalai_zero1",
+ "colossalai_zero1_cpu",
+ ],
+ default="ddp",
+ )
+ parser.add_argument("--num_episodes", type=int, default=3)
+ parser.add_argument("--num_collect_steps", type=int, default=8)
+ parser.add_argument("--num_update_steps", type=int, default=1)
+ parser.add_argument("--train_batch_size", type=int, default=8)
+ parser.add_argument("--experience_batch_size", type=int, default=8)
+ parser.add_argument("--lora_rank", type=int, default=0)
+ parser.add_argument("--cuda_mem_frac", type=float, default=1.0)
+ parser.add_argument("--offload_inference_models", action="store_true", default=False)
+ parser.add_argument("--use_kernels", action="store_true", default=False)
args = parser.parse_args()
main(args)
diff --git a/applications/Chat/benchmarks/ray/1mmt_dummy.py b/applications/Chat/benchmarks/ray/1mmt_dummy.py
index 7fc990448805..98ace3869450 100644
--- a/applications/Chat/benchmarks/ray/1mmt_dummy.py
+++ b/applications/Chat/benchmarks/ray/1mmt_dummy.py
@@ -22,13 +22,13 @@
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
- s.bind(('', 0))
+ s.bind(("", 0))
return s.getsockname()[1]
def get_local_ip():
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
- s.connect(('8.8.8.8', 80))
+ s.connect(("8.8.8.8", 80))
return s.getsockname()[0]
@@ -36,22 +36,25 @@ def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
- env_info_trainers = [{
- 'local_rank': '0',
- 'rank': str(rank),
- 'world_size': str(args.num_trainers),
- 'master_port': trainer_port,
- 'master_addr': master_addr
- } for rank in range(args.num_trainers)]
+ env_info_trainers = [
+ {
+ "local_rank": "0",
+ "rank": str(rank),
+ "world_size": str(args.num_trainers),
+ "master_port": trainer_port,
+ "master_addr": master_addr,
+ }
+ for rank in range(args.num_trainers)
+ ]
# maker_env_info
maker_port = str(get_free_port())
env_info_maker = {
- 'local_rank': '0',
- 'rank': '0',
- 'world_size': '1',
- 'master_port': maker_port,
- 'master_addr': master_addr
+ "local_rank": "0",
+ "rank": "0",
+ "world_size": "1",
+ "master_port": maker_port,
+ "master_addr": master_addr,
}
# configure tokenizer
@@ -63,21 +66,27 @@ def model_fn():
critic_cfg = AutoConfig.from_pretrained(args.critic_pretrain)
actor = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
critic = get_critic_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda()
- reward_model = get_reward_model_from_args(args.critic_model,
- config=critic_cfg).requires_grad_(False).half().cuda()
- if args.initial_model_quant_ckpt is not None and args.model == 'llama':
+ reward_model = (
+ get_reward_model_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda()
+ )
+ if args.initial_model_quant_ckpt is not None and args.model == "llama":
# quantize initial model
with low_resource_init(), no_init_weights():
initial_model = get_actor_from_args(args.model, config=actor_cfg)
- initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits,
- args.quant_group_size).cuda().requires_grad_(False)
+ initial_model.model = (
+ llama_load_quant(
+ initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, args.quant_group_size
+ )
+ .cuda()
+ .requires_grad_(False)
+ )
else:
initial_model = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
return actor, critic, reward_model, initial_model
# configure Experience Maker
experience_holder_ref = ExperienceMakerHolder.options(name="maker0", num_gpus=1, max_concurrency=2).remote(
- detached_trainer_name_list=[f'trainer{i}' for i in range(args.num_trainers)],
+ detached_trainer_name_list=[f"trainer{i}" for i in range(args.num_trainers)],
strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
model_fn=model_fn,
env_info=env_info_maker,
@@ -97,15 +106,18 @@ def model_fn():
def trainer_model_fn():
actor = get_actor_from_args(args.model, config=AutoConfig.from_pretrained(args.pretrain)).half().cuda()
- critic = get_critic_from_args(args.critic_model,
- config=AutoConfig.from_pretrained(args.critic_pretrain)).half().cuda()
+ critic = (
+ get_critic_from_args(args.critic_model, config=AutoConfig.from_pretrained(args.critic_pretrain))
+ .half()
+ .cuda()
+ )
return actor, critic
# configure Trainer
trainer_refs = [
DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=[
- f'maker{x}' for x in get_receivers_per_sender(i, args.num_trainers, 1, allow_idle_sender=True)
+ f"maker{x}" for x in get_receivers_per_sender(i, args.num_trainers, 1, allow_idle_sender=True)
],
strategy_fn=partial(get_strategy_from_args, args.trainer_strategy),
model_fn=trainer_model_fn,
@@ -114,7 +126,8 @@ def trainer_model_fn():
buffer_limit=16,
eval_performance=True,
debug=args.debug,
- ) for i, env_info_trainer in enumerate(env_info_trainers)
+ )
+ for i, env_info_trainer in enumerate(env_info_trainers)
]
dataset_size = args.experience_batch_size * 4
@@ -122,7 +135,7 @@ def trainer_model_fn():
def data_gen_fn():
input_ids = torch.randint(tokenizer.vocab_size, (256,), device=torch.cuda.current_device())
attn_mask = torch.ones_like(input_ids)
- return {'input_ids': input_ids, 'attention_mask': attn_mask}
+ return {"input_ids": input_ids, "attention_mask": attn_mask}
def build_dataloader(size):
dataset = [data_gen_fn() for _ in range(size)]
@@ -138,8 +151,10 @@ def build_dataloader(size):
wait_tasks = []
wait_tasks.append(
- experience_holder_ref.workingloop.remote(partial(build_dataloader, dataset_size),
- num_steps=args.experience_steps))
+ experience_holder_ref.workingloop.remote(
+ partial(build_dataloader, dataset_size), num_steps=args.experience_steps
+ )
+ )
total_steps = args.experience_batch_size * args.experience_steps // (args.num_trainers * args.train_batch_size)
for trainer_ref in trainer_refs:
@@ -148,31 +163,30 @@ def build_dataloader(size):
ray.get(wait_tasks)
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--num_trainers', type=int, default=1)
- parser.add_argument('--trainer_strategy',
- choices=[
- 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
- 'colossalai_zero2_cpu'
- ],
- default='ddp')
- parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
- parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
- parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
- parser.add_argument('--pretrain', type=str, default=None)
- parser.add_argument('--critic_pretrain', type=str, default=None)
- parser.add_argument('--experience_steps', type=int, default=4)
- parser.add_argument('--experience_batch_size', type=int, default=8)
- parser.add_argument('--train_epochs', type=int, default=1)
- parser.add_argument('--update_steps', type=int, default=2)
- parser.add_argument('--train_batch_size', type=int, default=8)
- parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
-
- parser.add_argument('--initial_model_quant_ckpt', type=str, default=None)
- parser.add_argument('--quant_bits', type=int, default=4)
- parser.add_argument('--quant_group_size', type=int, default=128)
- parser.add_argument('--debug', action='store_true')
+ parser.add_argument("--num_trainers", type=int, default=1)
+ parser.add_argument(
+ "--trainer_strategy",
+ choices=["ddp", "colossalai_gemini", "colossalai_zero2", "colossalai_gemini_cpu", "colossalai_zero2_cpu"],
+ default="ddp",
+ )
+ parser.add_argument("--maker_strategy", choices=["naive"], default="naive")
+ parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
+ parser.add_argument("--critic_model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
+ parser.add_argument("--pretrain", type=str, default=None)
+ parser.add_argument("--critic_pretrain", type=str, default=None)
+ parser.add_argument("--experience_steps", type=int, default=4)
+ parser.add_argument("--experience_batch_size", type=int, default=8)
+ parser.add_argument("--train_epochs", type=int, default=1)
+ parser.add_argument("--update_steps", type=int, default=2)
+ parser.add_argument("--train_batch_size", type=int, default=8)
+ parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+
+ parser.add_argument("--initial_model_quant_ckpt", type=str, default=None)
+ parser.add_argument("--quant_bits", type=int, default=4)
+ parser.add_argument("--quant_group_size", type=int, default=128)
+ parser.add_argument("--debug", action="store_true")
args = parser.parse_args()
ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
main(args)
diff --git a/applications/Chat/benchmarks/ray/mmmt_dummy.py b/applications/Chat/benchmarks/ray/mmmt_dummy.py
index ca1df22070fc..f8860f2979ee 100644
--- a/applications/Chat/benchmarks/ray/mmmt_dummy.py
+++ b/applications/Chat/benchmarks/ray/mmmt_dummy.py
@@ -22,13 +22,13 @@
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
- s.bind(('', 0))
+ s.bind(("", 0))
return s.getsockname()[1]
def get_local_ip():
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
- s.connect(('8.8.8.8', 80))
+ s.connect(("8.8.8.8", 80))
return s.getsockname()[0]
@@ -36,23 +36,29 @@ def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
- env_info_trainers = [{
- 'local_rank': '0',
- 'rank': str(rank),
- 'world_size': str(args.num_trainers),
- 'master_port': trainer_port,
- 'master_addr': master_addr
- } for rank in range(args.num_trainers)]
+ env_info_trainers = [
+ {
+ "local_rank": "0",
+ "rank": str(rank),
+ "world_size": str(args.num_trainers),
+ "master_port": trainer_port,
+ "master_addr": master_addr,
+ }
+ for rank in range(args.num_trainers)
+ ]
# maker_env_info
maker_port = str(get_free_port())
- env_info_makers = [{
- 'local_rank': '0',
- 'rank': str(rank),
- 'world_size': str(args.num_makers),
- 'master_port': maker_port,
- 'master_addr': master_addr
- } for rank in range(args.num_makers)]
+ env_info_makers = [
+ {
+ "local_rank": "0",
+ "rank": str(rank),
+ "world_size": str(args.num_makers),
+ "master_port": maker_port,
+ "master_addr": master_addr,
+ }
+ for rank in range(args.num_makers)
+ ]
# configure tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
@@ -63,14 +69,20 @@ def model_fn():
critic_cfg = AutoConfig.from_pretrained(args.critic_pretrain)
actor = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
critic = get_critic_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda()
- reward_model = get_reward_model_from_args(args.critic_model,
- config=critic_cfg).requires_grad_(False).half().cuda()
- if args.initial_model_quant_ckpt is not None and args.model == 'llama':
+ reward_model = (
+ get_reward_model_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda()
+ )
+ if args.initial_model_quant_ckpt is not None and args.model == "llama":
# quantize initial model
with low_resource_init(), no_init_weights():
initial_model = get_actor_from_args(args.model, config=actor_cfg)
- initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits,
- args.quant_group_size).cuda().requires_grad_(False)
+ initial_model.model = (
+ llama_load_quant(
+ initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, args.quant_group_size
+ )
+ .cuda()
+ .requires_grad_(False)
+ )
else:
initial_model = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda()
return actor, critic, reward_model, initial_model
@@ -79,7 +91,7 @@ def model_fn():
experience_holder_refs = [
ExperienceMakerHolder.options(name=f"maker{i}", num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=[
- f'trainer{x}'
+ f"trainer{x}"
for x in get_receivers_per_sender(i, args.num_makers, args.num_trainers, allow_idle_sender=False)
],
strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
@@ -103,8 +115,11 @@ def model_fn():
def trainer_model_fn():
actor = get_actor_from_args(args.model, config=AutoConfig.from_pretrained(args.pretrain)).half().cuda()
- critic = get_critic_from_args(args.critic_model,
- config=AutoConfig.from_pretrained(args.critic_pretrain)).half().cuda()
+ critic = (
+ get_critic_from_args(args.critic_model, config=AutoConfig.from_pretrained(args.critic_pretrain))
+ .half()
+ .cuda()
+ )
return actor, critic
# configure Trainer
@@ -130,7 +145,7 @@ def trainer_model_fn():
def data_gen_fn():
input_ids = torch.randint(tokenizer.vocab_size, (256,), device=torch.cuda.current_device())
attn_mask = torch.ones_like(input_ids)
- return {'input_ids': input_ids, 'attention_mask': attn_mask}
+ return {"input_ids": input_ids, "attention_mask": attn_mask}
def build_dataloader(size):
dataset = [data_gen_fn() for _ in range(size)]
@@ -147,43 +162,48 @@ def build_dataloader(size):
for experience_holder_ref in experience_holder_refs:
wait_tasks.append(
- experience_holder_ref.workingloop.remote(partial(build_dataloader, dataset_size),
- num_steps=args.experience_steps))
+ experience_holder_ref.workingloop.remote(
+ partial(build_dataloader, dataset_size), num_steps=args.experience_steps
+ )
+ )
- total_steps = args.experience_batch_size * args.experience_steps * \
- args.num_makers // (args.num_trainers * args.train_batch_size)
+ total_steps = (
+ args.experience_batch_size
+ * args.experience_steps
+ * args.num_makers
+ // (args.num_trainers * args.train_batch_size)
+ )
for trainer_ref in trainer_refs:
wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs))
ray.get(wait_tasks)
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--num_makers', type=int, default=1)
- parser.add_argument('--num_trainers', type=int, default=1)
- parser.add_argument('--trainer_strategy',
- choices=[
- 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
- 'colossalai_zero2_cpu'
- ],
- default='ddp')
- parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
- parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
- parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
- parser.add_argument('--pretrain', type=str, default=None)
- parser.add_argument('--critic_pretrain', type=str, default=None)
- parser.add_argument('--experience_steps', type=int, default=4)
- parser.add_argument('--experience_batch_size', type=int, default=8)
- parser.add_argument('--train_epochs', type=int, default=1)
- parser.add_argument('--update_steps', type=int, default=2)
- parser.add_argument('--train_batch_size', type=int, default=8)
- parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
-
- parser.add_argument('--initial_model_quant_ckpt', type=str, default=None)
- parser.add_argument('--quant_bits', type=int, default=4)
- parser.add_argument('--quant_group_size', type=int, default=128)
- parser.add_argument('--debug', action='store_true')
+ parser.add_argument("--num_makers", type=int, default=1)
+ parser.add_argument("--num_trainers", type=int, default=1)
+ parser.add_argument(
+ "--trainer_strategy",
+ choices=["ddp", "colossalai_gemini", "colossalai_zero2", "colossalai_gemini_cpu", "colossalai_zero2_cpu"],
+ default="ddp",
+ )
+ parser.add_argument("--maker_strategy", choices=["naive"], default="naive")
+ parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
+ parser.add_argument("--critic_model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
+ parser.add_argument("--pretrain", type=str, default=None)
+ parser.add_argument("--critic_pretrain", type=str, default=None)
+ parser.add_argument("--experience_steps", type=int, default=4)
+ parser.add_argument("--experience_batch_size", type=int, default=8)
+ parser.add_argument("--train_epochs", type=int, default=1)
+ parser.add_argument("--update_steps", type=int, default=2)
+ parser.add_argument("--train_batch_size", type=int, default=8)
+ parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+
+ parser.add_argument("--initial_model_quant_ckpt", type=str, default=None)
+ parser.add_argument("--quant_bits", type=int, default=4)
+ parser.add_argument("--quant_group_size", type=int, default=128)
+ parser.add_argument("--debug", action="store_true")
args = parser.parse_args()
ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
main(args)
diff --git a/applications/Chat/coati/dataset/__init__.py b/applications/Chat/coati/dataset/__init__.py
index bd4e5460d11e..599b57609775 100644
--- a/applications/Chat/coati/dataset/__init__.py
+++ b/applications/Chat/coati/dataset/__init__.py
@@ -4,7 +4,10 @@
from .utils import is_rank_0
__all__ = [
- 'RmStaticDataset', 'HhRlhfDataset',
- 'SFTDataset', 'SupervisedDataset',
- 'PromptDataset', 'is_rank_0',
+ "RmStaticDataset",
+ "HhRlhfDataset",
+ "SFTDataset",
+ "SupervisedDataset",
+ "PromptDataset",
+ "is_rank_0",
]
diff --git a/applications/Chat/coati/dataset/conversation.py b/applications/Chat/coati/dataset/conversation.py
index 465fa867c7ab..f2180d96b0d3 100644
--- a/applications/Chat/coati/dataset/conversation.py
+++ b/applications/Chat/coati/dataset/conversation.py
@@ -49,7 +49,7 @@ def append_message(self, role, message):
def to_gradio_chatbot(self):
ret = []
- for i, (role, msg) in enumerate(self.messages[self.offset:]):
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
if i % 2 == 0:
ret.append([msg, None])
else:
@@ -57,12 +57,14 @@ def to_gradio_chatbot(self):
return ret
def copy(self):
- return Conversation(system=self.system,
- roles=self.roles,
- messages=[[x, y] for x, y in self.messages],
- offset=self.offset,
- sep_style=self.sep_style,
- sep=self.sep)
+ return Conversation(
+ system=self.system,
+ roles=self.roles,
+ messages=[[x, y] for x, y in self.messages],
+ offset=self.offset,
+ sep_style=self.sep_style,
+ sep=self.sep,
+ )
def dict(self):
return {
@@ -70,7 +72,7 @@ def dict(self):
"roles": self.roles,
"messages": self.messages,
"offset": self.offset,
- "sep": self.sep
+ "sep": self.sep,
}
diff --git a/applications/Chat/coati/dataset/prompt_dataset.py b/applications/Chat/coati/dataset/prompt_dataset.py
index 2c953fffa513..17120e6064b5 100644
--- a/applications/Chat/coati/dataset/prompt_dataset.py
+++ b/applications/Chat/coati/dataset/prompt_dataset.py
@@ -13,11 +13,13 @@
class PromptDataset(Dataset):
"""Dataset for supervised fine-tuning."""
- def __init__(self,
- data_path: str,
- tokenizer: transformers.PreTrainedTokenizer,
- max_datasets_size: int = None,
- max_length: int = 96):
+ def __init__(
+ self,
+ data_path: str,
+ tokenizer: transformers.PreTrainedTokenizer,
+ max_datasets_size: int = None,
+ max_length: int = 96,
+ ):
super(PromptDataset, self).__init__()
self.keyed_prompt = defaultdict(list)
self.logger = get_dist_logger()
@@ -30,11 +32,9 @@ def __init__(self,
list_data_dict = list_data_dict[:max_datasets_size]
instructions = [data_dict["instruction"] for data_dict in list_data_dict]
- tokens = tokenizer(instructions,
- return_tensors='pt',
- max_length=max_length,
- padding='max_length',
- truncation=True)
+ tokens = tokenizer(
+ instructions, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True
+ )
for k, tensor in tokens.items():
self.keyed_prompt[k] = tensor.to(torch.cuda.current_device()).unbind()
diff --git a/applications/Chat/coati/dataset/reward_dataset.py b/applications/Chat/coati/dataset/reward_dataset.py
index 3c4ec8b214bb..3afcd7b69238 100644
--- a/applications/Chat/coati/dataset/reward_dataset.py
+++ b/applications/Chat/coati/dataset/reward_dataset.py
@@ -20,44 +20,31 @@ class RmStaticDataset(Dataset):
def __init__(self, dataset, tokenizer: Callable, max_length: int, special_token=None) -> None:
super().__init__()
- self.end_token = tokenizer.eos_token \
- if special_token is None else special_token
-
- chosen = [
- data["prompt"] + data["chosen"] + self.end_token
- for data in tqdm(dataset, disable=not is_rank_0())
- ]
- chosen_token = tokenizer(chosen,
- max_length=max_length,
- padding="max_length",
- truncation=True,
- return_tensors="pt")
- self.chosen = {
- "input_ids": chosen_token["input_ids"],
- "attention_mask": chosen_token["attention_mask"]
- }
-
- reject = [
- data["prompt"] + data["rejected"] + self.end_token
- for data in tqdm(dataset, disable=not is_rank_0())
- ]
- reject_token = tokenizer(reject,
- max_length=max_length,
- padding="max_length",
- truncation=True,
- return_tensors="pt")
- self.reject = {
- "input_ids": reject_token["input_ids"],
- "attention_mask": reject_token["attention_mask"]
- }
+ self.end_token = tokenizer.eos_token if special_token is None else special_token
+
+ chosen = [data["prompt"] + data["chosen"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
+ chosen_token = tokenizer(
+ chosen, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+ )
+ self.chosen = {"input_ids": chosen_token["input_ids"], "attention_mask": chosen_token["attention_mask"]}
+
+ reject = [data["prompt"] + data["rejected"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
+ reject_token = tokenizer(
+ reject, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+ )
+ self.reject = {"input_ids": reject_token["input_ids"], "attention_mask": reject_token["attention_mask"]}
def __len__(self):
length = self.chosen["input_ids"].shape[0]
return length
def __getitem__(self, idx):
- return self.chosen["input_ids"][idx], self.chosen["attention_mask"][idx], \
- self.reject["input_ids"][idx], self.reject["attention_mask"][idx]
+ return (
+ self.chosen["input_ids"][idx],
+ self.chosen["attention_mask"][idx],
+ self.reject["input_ids"][idx],
+ self.reject["attention_mask"][idx],
+ )
# Anthropic/hh-rlhf
@@ -74,41 +61,28 @@ class HhRlhfDataset(Dataset):
def __init__(self, dataset, tokenizer: Callable, max_length: int, special_token=None) -> None:
super().__init__()
- self.end_token = tokenizer.eos_token \
- if special_token is None else special_token
-
- chosen = [
- data["chosen"] + self.end_token
- for data in tqdm(dataset, disable=not is_rank_0())
- ]
- chosen_token = tokenizer(chosen,
- max_length=max_length,
- padding="max_length",
- truncation=True,
- return_tensors="pt")
- self.chosen = {
- "input_ids": chosen_token["input_ids"],
- "attention_mask": chosen_token["attention_mask"]
- }
-
- reject = [
- data["rejected"] + self.end_token
- for data in tqdm(dataset, disable=not is_rank_0())
- ]
- reject_token = tokenizer(reject,
- max_length=max_length,
- padding="max_length",
- truncation=True,
- return_tensors="pt")
- self.reject = {
- "input_ids": reject_token["input_ids"],
- "attention_mask": reject_token["attention_mask"]
- }
+ self.end_token = tokenizer.eos_token if special_token is None else special_token
+
+ chosen = [data["chosen"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
+ chosen_token = tokenizer(
+ chosen, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+ )
+ self.chosen = {"input_ids": chosen_token["input_ids"], "attention_mask": chosen_token["attention_mask"]}
+
+ reject = [data["rejected"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
+ reject_token = tokenizer(
+ reject, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+ )
+ self.reject = {"input_ids": reject_token["input_ids"], "attention_mask": reject_token["attention_mask"]}
def __len__(self):
length = self.chosen["input_ids"].shape[0]
return length
def __getitem__(self, idx):
- return self.chosen["input_ids"][idx], self.chosen["attention_mask"][idx], \
- self.reject["input_ids"][idx], self.reject["attention_mask"][idx]
+ return (
+ self.chosen["input_ids"][idx],
+ self.chosen["attention_mask"][idx],
+ self.reject["input_ids"][idx],
+ self.reject["attention_mask"][idx],
+ )
diff --git a/applications/Chat/coati/dataset/sft_dataset.py b/applications/Chat/coati/dataset/sft_dataset.py
index 2959d3fac81c..c0e257f54a07 100644
--- a/applications/Chat/coati/dataset/sft_dataset.py
+++ b/applications/Chat/coati/dataset/sft_dataset.py
@@ -13,13 +13,14 @@
# limitations under the License.
import copy
-from typing import Dict, Sequence, Tuple
+from typing import Dict, Optional, Sequence, Tuple
import torch
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import PreTrainedTokenizer
-from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
+
from colossalai.logging import get_dist_logger
from .utils import is_rank_0, jload
@@ -28,33 +29,35 @@
IGNORE_INDEX = -100
PROMPT_DICT = {
- "prompt_input": ("Below is an instruction that describes a task, paired with an input that provides further context. "
- "Write a response that appropriately completes the request.\n\n"
- "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"),
- "prompt_no_input": ("Below is an instruction that describes a task. "
- "Write a response that appropriately completes the request.\n\n"
- "### Instruction:\n{instruction}\n\n### Response:"),
+ "prompt_input": (
+ "Below is an instruction that describes a task, paired with an input that provides further context. "
+ "Write a response that appropriately completes the request.\n\n"
+ "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
+ ),
+ "prompt_no_input": (
+ "Below is an instruction that describes a task. "
+ "Write a response that appropriately completes the request.\n\n"
+ "### Instruction:\n{instruction}\n\n### Response:"
+ ),
}
-def _preprocess(sources: Sequence[str],
- targets: Sequence[str],
- tokenizer: PreTrainedTokenizer,
- max_length: int,
- ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+def _preprocess(
+ sources: Sequence[str],
+ targets: Sequence[str],
+ tokenizer: PreTrainedTokenizer,
+ max_length: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Preprocess the data by tokenizing."""
sequences = [s + t for s, t in zip(sources, targets)]
- sequences_token = tokenizer(sequences,
- max_length=max_length,
- padding="max_length",
- truncation=True,
- return_tensors="pt")
- sources_token = tokenizer(sources,
- max_length=max_length,
- padding="max_length",
- truncation=True,
- return_tensors="pt")
-
+ sequences_token = tokenizer(
+ sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+ )
+ sources_token = tokenizer(
+ sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+ )
+
+ assert sequences_token["attention_mask"].dim() == 2, "seq2seq model should be preprocessed differently"
labels = copy.deepcopy(sequences_token["input_ids"])
for i in range(labels.shape[0]):
source_len = sources_token["attention_mask"][i].sum().item()
@@ -62,25 +65,27 @@ def _preprocess(sources: Sequence[str],
if tokenizer.padding_side == "right":
# |prompt|completion|eos|pad|
labels[i][:source_len] = IGNORE_INDEX
+ labels[i][-pad_len:] = IGNORE_INDEX
elif tokenizer.padding_side == "left":
# |pad|prompt|completion|eos|
- labels[i][pad_len:pad_len + source_len] = IGNORE_INDEX
+ labels[i][: pad_len + source_len] = IGNORE_INDEX
else:
raise RuntimeError()
return sequences_token["input_ids"], labels, sequences_token["attention_mask"]
-def _preprocess_chatglm(sources: Sequence[str],
- targets: Sequence[str],
- tokenizer: PreTrainedTokenizer,
- max_length: int,
- ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+def _preprocess_chatglm(
+ sources: Sequence[str],
+ targets: Sequence[str],
+ tokenizer: PreTrainedTokenizer,
+ max_length: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Preprocess the data by tokenizing.
None for attention mask, ChatGLM will calculate attention mask according to input ids
"""
-
+
labels = []
input_ids = []
for source, target in zip(sources, targets):
@@ -90,16 +95,16 @@ def _preprocess_chatglm(sources: Sequence[str],
# truncate
sp_token_list = [tokenizer.gmask_token_id, tokenizer.bos_token_id]
truncate_length = max(0, len(input_id) - max_length)
- input_id = input_id[truncate_length: ]
+ input_id = input_id[truncate_length:]
if truncate_length == len(source_id) + 1:
- input_id = sp_token_list + input_id[1: ]
+ input_id = sp_token_list + input_id[1:]
elif truncate_length > len(source_id) + 1:
- input_id = sp_token_list + input_id[2: ]
-
+ input_id = sp_token_list + input_id[2:]
+
context_length = input_id.index(tokenizer.bos_token_id)
mask_position = context_length - 1
- label = [IGNORE_INDEX] * context_length + input_id[mask_position+1:]
-
+ label = [IGNORE_INDEX] * context_length + input_id[mask_position + 1 :]
+
pad_len = max_length - len(input_id)
input_id = input_id + [tokenizer.pad_token_id] * pad_len
input_ids.append(input_id)
@@ -117,25 +122,22 @@ class SFTDataset(Dataset):
max_length: max length of input
"""
- def __init__(self,
- dataset: Dict,
- tokenizer: PreTrainedTokenizer,
- max_length: int = 512
- ) -> None:
+ def __init__(self, dataset: Dict, tokenizer: PreTrainedTokenizer, max_length: int = 512) -> None:
super().__init__()
self.input_ids = []
sources = [data["prompt"] for data in dataset]
- targets = [
- data["completion"] + tokenizer.eos_token
- for data in tqdm(dataset, disable=not is_rank_0())
- ]
+ targets = [data["completion"] + tokenizer.eos_token for data in tqdm(dataset, disable=not is_rank_0())]
+
+ logger.info("Tokenizing inputs... This may take some time...")
if isinstance(tokenizer, ChatGLMTokenizer):
- self.input_ids, self.labels, self.attention_mask = \
- _preprocess_chatglm(sources, targets, tokenizer, max_length)
+ self.input_ids, self.labels, self.attention_mask = _preprocess_chatglm(
+ sources, targets, tokenizer, max_length
+ )
else:
- self.input_ids, self.labels, self.attention_mask = \
- _preprocess(sources, targets, tokenizer, max_length)
+ self.input_ids, self.labels, self.attention_mask = _preprocess(sources, targets, tokenizer, max_length)
+
+ logger.info("Loaded dataset.")
def __len__(self):
length = self.input_ids.shape[0]
@@ -143,22 +145,21 @@ def __len__(self):
def __getitem__(self, idx):
if self.attention_mask is not None:
- return dict(input_ids=self.input_ids[idx],
- labels=self.labels[idx],
- attention_mask=self.attention_mask[idx])
+ return dict(input_ids=self.input_ids[idx], labels=self.labels[idx], attention_mask=self.attention_mask[idx])
else:
- return dict(input_ids=self.input_ids[idx],
- labels=self.labels[idx])
+ return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])
class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
- def __init__(self,
- data_path: str,
- tokenizer: PreTrainedTokenizer,
- max_datasets_size: int = None,
- max_length: int = 512):
+ def __init__(
+ self,
+ data_path: str,
+ tokenizer: PreTrainedTokenizer,
+ max_datasets_size: Optional[int] = None,
+ max_length: int = 512,
+ ):
super().__init__()
logger.info("Loading data...")
list_data_dict = jload(data_path)
@@ -174,18 +175,17 @@ def __init__(self,
prompt_input.format_map(example) if "input" in example else prompt_no_input.format_map(example)
for example in list_data_dict
]
- targets = [
- example['output'] + tokenizer.eos_token
- for example in list_data_dict
- ]
+ targets = [example["output"] + tokenizer.eos_token for example in list_data_dict]
logger.info("Tokenizing inputs... This may take some time...")
if isinstance(tokenizer, ChatGLMTokenizer):
- self.input_ids, self.labels, self.attention_mask = \
- _preprocess_chatglm(sources, targets, tokenizer, max_length)
+ self.input_ids, self.labels, self.attention_mask = _preprocess_chatglm(
+ sources, targets, tokenizer, max_length
+ )
else:
- self.input_ids, self.labels, self.attention_mask = \
- _preprocess(sources, targets, tokenizer, max_length)
+ self.input_ids, self.labels, self.attention_mask = _preprocess(sources, targets, tokenizer, max_length)
+
+ logger.info("Loaded dataset.")
def __len__(self):
length = self.input_ids.shape[0]
@@ -193,9 +193,6 @@ def __len__(self):
def __getitem__(self, idx):
if self.attention_mask is not None:
- return dict(input_ids=self.input_ids[idx],
- labels=self.labels[idx],
- attention_mask=self.attention_mask[idx])
+ return dict(input_ids=self.input_ids[idx], labels=self.labels[idx], attention_mask=self.attention_mask[idx])
else:
- return dict(input_ids=self.input_ids[idx],
- labels=self.labels[idx])
+ return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])
diff --git a/applications/Chat/coati/experience_buffer/__init__.py b/applications/Chat/coati/experience_buffer/__init__.py
index c0188dc4a471..f2a48d0a3b20 100644
--- a/applications/Chat/coati/experience_buffer/__init__.py
+++ b/applications/Chat/coati/experience_buffer/__init__.py
@@ -1,4 +1,4 @@
from .base import ExperienceBuffer
from .naive import NaiveExperienceBuffer
-__all__ = ['ExperienceBuffer', 'NaiveExperienceBuffer']
+__all__ = ["ExperienceBuffer", "NaiveExperienceBuffer"]
diff --git a/applications/Chat/coati/experience_buffer/base.py b/applications/Chat/coati/experience_buffer/base.py
index 9ccdc935d506..7047785308f3 100644
--- a/applications/Chat/coati/experience_buffer/base.py
+++ b/applications/Chat/coati/experience_buffer/base.py
@@ -7,9 +7,9 @@
class ExperienceBuffer(ABC):
"""Experience buffer base class. It stores experience.
- Args:
- sample_batch_size (int): Batch size when sampling.
- limit (int, optional): Limit of number of experience samples. A number <= 0 means unlimited. Defaults to 0.
+ Args:
+ sample_batch_size (int): Batch size when sampling.
+ limit (int, optional): Limit of number of experience samples. A number <= 0 means unlimited. Defaults to 0.
"""
def __init__(self, sample_batch_size: int, limit: int = 0) -> None:
diff --git a/applications/Chat/coati/experience_buffer/naive.py b/applications/Chat/coati/experience_buffer/naive.py
index bd5213b38993..d47b67dbe713 100644
--- a/applications/Chat/coati/experience_buffer/naive.py
+++ b/applications/Chat/coati/experience_buffer/naive.py
@@ -1,4 +1,5 @@
import random
+import warnings
from typing import List
import torch
@@ -11,28 +12,30 @@
class NaiveExperienceBuffer(ExperienceBuffer):
"""Naive experience buffer class. It stores experience.
- Args:
- sample_batch_size (int): Batch size when sampling.
- limit (int, optional): Limit of number of experience samples. A number <= 0 means unlimited. Defaults to 0.
- cpu_offload (bool, optional): Whether to offload experience to cpu when sampling. Defaults to True.
+ Args:
+ sample_batch_size (int): Batch size when sampling.
+ limit (int, optional): Limit of number of experience samples. A number <= 0 means unlimited. Defaults to 0.
+ cpu_offload (bool, optional): Whether to offload experience to cpu when sampling. Defaults to True.
"""
def __init__(self, sample_batch_size: int, limit: int = 0, cpu_offload: bool = True) -> None:
super().__init__(sample_batch_size, limit)
self.cpu_offload = cpu_offload
- self.target_device = torch.device(f'cuda:{torch.cuda.current_device()}')
+ self.target_device = torch.device(f"cuda:{torch.cuda.current_device()}")
# TODO(ver217): add prefetch
self.items: List[BufferItem] = []
@torch.no_grad()
def append(self, experience: Experience) -> None:
if self.cpu_offload:
- experience.to_device(torch.device('cpu'))
+ experience.to_device(torch.device("cpu"))
items = split_experience_batch(experience)
self.items.extend(items)
+
if self.limit > 0:
samples_to_remove = len(self.items) - self.limit
if samples_to_remove > 0:
+ warnings.warn(f"Experience buffer is full. Removing {samples_to_remove} samples.")
self.items = self.items[samples_to_remove:]
def clear(self) -> None:
diff --git a/applications/Chat/coati/experience_buffer/utils.py b/applications/Chat/coati/experience_buffer/utils.py
index c2a34212e2f4..baedbebd184f 100644
--- a/applications/Chat/coati/experience_buffer/utils.py
+++ b/applications/Chat/coati/experience_buffer/utils.py
@@ -21,6 +21,7 @@ class BufferItem:
"A" is the number of actions.
"""
+
sequences: torch.Tensor
action_log_probs: torch.Tensor
values: torch.Tensor
@@ -33,8 +34,7 @@ class BufferItem:
def split_experience_batch(experience: Experience) -> List[BufferItem]:
batch_size = experience.sequences.size(0)
batch_kwargs = [{} for _ in range(batch_size)]
- keys = ('sequences', 'action_log_probs', 'values',
- 'reward', 'advantages', 'attention_mask', 'action_mask')
+ keys = ("sequences", "action_log_probs", "values", "reward", "advantages", "attention_mask", "action_mask")
for key in keys:
value = getattr(experience, key)
if isinstance(value, torch.Tensor):
@@ -49,22 +49,21 @@ def split_experience_batch(experience: Experience) -> List[BufferItem]:
return items
-def _zero_pad_sequences(sequences: List[torch.Tensor], side: str = 'left') -> torch.Tensor:
- assert side in ('left', 'right')
+def _zero_pad_sequences(sequences: List[torch.Tensor], side: str = "left") -> torch.Tensor:
+ assert side in ("left", "right")
max_len = max(seq.size(0) for seq in sequences)
padded_sequences = []
for seq in sequences:
pad_len = max_len - seq.size(0)
- padding = (pad_len, 0) if side == 'left' else (0, pad_len)
+ padding = (pad_len, 0) if side == "left" else (0, pad_len)
padded_sequences.append(F.pad(seq, padding))
return torch.stack(padded_sequences, dim=0)
def make_experience_batch(items: List[BufferItem]) -> Experience:
kwargs = {}
- to_pad_keys = set(('action_log_probs', 'action_mask'))
- keys = ('sequences', 'action_log_probs', 'values',
- 'reward', 'advantages', 'attention_mask', 'action_mask')
+ to_pad_keys = set(("action_log_probs", "action_mask"))
+ keys = ("sequences", "action_log_probs", "values", "reward", "advantages", "attention_mask", "action_mask")
for key in keys:
vals = [getattr(item, key) for item in items]
if key in to_pad_keys:
diff --git a/applications/Chat/coati/experience_maker/__init__.py b/applications/Chat/coati/experience_maker/__init__.py
index 39ca7576b227..06452292e77c 100644
--- a/applications/Chat/coati/experience_maker/__init__.py
+++ b/applications/Chat/coati/experience_maker/__init__.py
@@ -1,4 +1,4 @@
from .base import Experience, ExperienceMaker
from .naive import NaiveExperienceMaker
-__all__ = ['Experience', 'ExperienceMaker', 'NaiveExperienceMaker']
+__all__ = ["Experience", "ExperienceMaker", "NaiveExperienceMaker"]
diff --git a/applications/Chat/coati/experience_maker/base.py b/applications/Chat/coati/experience_maker/base.py
index b4646f282f0c..0731f6e0f97f 100644
--- a/applications/Chat/coati/experience_maker/base.py
+++ b/applications/Chat/coati/experience_maker/base.py
@@ -3,8 +3,7 @@
from typing import Optional
import torch
-import torch.nn as nn
-from coati.models.base import Actor
+from coati.models.base import Actor, Critic, RewardModel
@dataclass
@@ -24,6 +23,7 @@ class Experience:
"A" is the number of actions.
"""
+
sequences: torch.Tensor
action_log_probs: torch.Tensor
values: torch.Tensor
@@ -58,20 +58,13 @@ def pin_memory(self):
class ExperienceMaker(ABC):
-
- def __init__(self,
- actor: Actor,
- critic: nn.Module,
- reward_model: nn.Module,
- initial_model: Actor,
- kl_coef: float = 0.1) -> None:
+ def __init__(self, actor: Actor, critic: Critic, reward_model: RewardModel, initial_model: Actor) -> None:
super().__init__()
self.actor = actor
self.critic = critic
self.reward_model = reward_model
self.initial_model = initial_model
- self.kl_coef = kl_coef
@abstractmethod
- def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experience:
+ def make_experience(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **generate_kwargs) -> Experience:
pass
diff --git a/applications/Chat/coati/experience_maker/naive.py b/applications/Chat/coati/experience_maker/naive.py
index 496f8ab445fc..941e1994b148 100644
--- a/applications/Chat/coati/experience_maker/naive.py
+++ b/applications/Chat/coati/experience_maker/naive.py
@@ -1,7 +1,9 @@
import torch
import torch.nn.functional as F
+from coati.models.base import Actor, Critic, RewardModel
from coati.models.generation import generate
from coati.models.utils import calc_action_log_probs, compute_reward
+from transformers import PreTrainedTokenizer
from .base import Experience, ExperienceMaker
@@ -11,6 +13,19 @@ class NaiveExperienceMaker(ExperienceMaker):
Naive experience maker.
"""
+ def __init__(
+ self,
+ actor: Actor,
+ critic: Critic,
+ reward_model: RewardModel,
+ initial_model: Actor,
+ tokenizer: PreTrainedTokenizer,
+ kl_coef: float = 0.1,
+ ) -> None:
+ super().__init__(actor, critic, reward_model, initial_model)
+ self.tokenizer = tokenizer
+ self.kl_coef = kl_coef
+
@torch.no_grad()
def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experience:
self.actor.eval()
@@ -19,33 +34,32 @@ def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experie
self.reward_model.eval()
# generate sequences
- sequences = generate(self.actor, input_ids, **generate_kwargs)
+ sequences = generate(self.actor, input_ids, self.tokenizer, **generate_kwargs)
# calculate auxiliary tensors
attention_mask = None
- pad_token_id = generate_kwargs.get('pad_token_id', None)
+ pad_token_id = self.tokenizer.pad_token_id
if pad_token_id is not None:
- attention_mask = sequences.not_equal(pad_token_id)\
- .to(dtype=torch.long, device=sequences.device)
+ attention_mask = sequences.not_equal(pad_token_id).to(dtype=torch.long, device=sequences.device)
input_len = input_ids.size(1)
- eos_token_id = generate_kwargs.get('eos_token_id', None)
+ eos_token_id = self.tokenizer.eos_token_id
if eos_token_id is None:
action_mask = torch.ones_like(sequences, dtype=torch.bool)
else:
# left padding may be applied, only mask action
action_mask = (sequences[:, input_len:] == eos_token_id).cumsum(dim=-1) == 0
- action_mask = F.pad(action_mask, (1 + input_len, -1), value=True) # include eos token and input
+ action_mask = F.pad(action_mask, (1 + input_len, -1), value=True) # include eos token and input
action_mask[:, :input_len] = False
action_mask = action_mask[:, 1:]
- action_mask = action_mask[:, -(sequences.size(1) - input_len):]
+ action_mask = action_mask[:, -(sequences.size(1) - input_len) :]
num_actions = action_mask.size(1)
- actor_output = self.actor(sequences, attention_mask)
+ actor_output = self.actor(sequences, attention_mask)["logits"]
action_log_probs = calc_action_log_probs(actor_output, sequences, num_actions)
- base_model_output = self.initial_model(sequences, attention_mask)
+ base_model_output = self.initial_model(sequences, attention_mask)["logits"]
base_action_log_probs = calc_action_log_probs(base_model_output, sequences, num_actions)
- value = self.critic(sequences, action_mask, attention_mask)
+ value = self.critic(sequences, attention_mask)
r = self.reward_model(sequences, attention_mask)
reward = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask)
diff --git a/applications/Chat/coati/kernels/__init__.py b/applications/Chat/coati/kernels/__init__.py
index 230eedf7ecba..96d40c7c4709 100644
--- a/applications/Chat/coati/kernels/__init__.py
+++ b/applications/Chat/coati/kernels/__init__.py
@@ -1,6 +1,6 @@
from .wrapper import convert_to_xformer_model, recover_from_xformer_model
__all__ = [
- 'convert_to_xformer_model',
- 'recover_from_xformer_model',
+ "convert_to_xformer_model",
+ "recover_from_xformer_model",
]
diff --git a/applications/Chat/coati/kernels/opt_attn.py b/applications/Chat/coati/kernels/opt_attn.py
index e99f9c2247d1..d1eb139187f3 100644
--- a/applications/Chat/coati/kernels/opt_attn.py
+++ b/applications/Chat/coati/kernels/opt_attn.py
@@ -21,11 +21,12 @@ def forward(
output_attentions: bool = False,
) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]:
if not self.training:
- return super().forward(hidden_states, key_value_states, past_key_value, attention_mask, layer_head_mask,
- output_attentions)
+ return super().forward(
+ hidden_states, key_value_states, past_key_value, attention_mask, layer_head_mask, output_attentions
+ )
"""Input shape: Batch x Time x Channel"""
- assert layer_head_mask is None, 'Xformers attention does not support layer_head_mask'
- assert not output_attentions, 'Xformers attention does not support output_attentions'
+ assert layer_head_mask is None, "Xformers attention does not support layer_head_mask"
+ assert not output_attentions, "Xformers attention does not support output_attentions"
# if key_value_states are provided this layer is used as a cross-attention layer
# for the decoder
@@ -69,12 +70,14 @@ def forward(
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
- attn_output = xops.memory_efficient_attention(query_states,
- key_states,
- value_states,
- attn_bias=xops.LowerTriangularMask(),
- p=self.dropout if self.training else 0.0,
- scale=self.scaling)
+ attn_output = xops.memory_efficient_attention(
+ query_states,
+ key_states,
+ value_states,
+ attn_bias=xops.LowerTriangularMask(),
+ p=self.dropout if self.training else 0.0,
+ scale=self.scaling,
+ )
# Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
# partitioned across GPUs when using tensor-parallelism.
diff --git a/applications/Chat/coati/models/__init__.py b/applications/Chat/coati/models/__init__.py
index 0a296a863756..ad4a525b4af2 100644
--- a/applications/Chat/coati/models/__init__.py
+++ b/applications/Chat/coati/models/__init__.py
@@ -3,6 +3,13 @@
from .loss import LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
__all__ = [
- 'Actor', 'Critic', 'RewardModel', 'PolicyLoss', 'ValueLoss', 'LogSigLoss', 'LogExpLoss',
- 'LoRAModule', 'convert_to_lora_module'
+ "Actor",
+ "Critic",
+ "RewardModel",
+ "PolicyLoss",
+ "ValueLoss",
+ "LogSigLoss",
+ "LogExpLoss",
+ "LoRAModule",
+ "convert_to_lora_module",
]
diff --git a/applications/Chat/coati/models/base/__init__.py b/applications/Chat/coati/models/base/__init__.py
index c5f748a0c85a..5c9905bb2224 100644
--- a/applications/Chat/coati/models/base/__init__.py
+++ b/applications/Chat/coati/models/base/__init__.py
@@ -9,7 +9,7 @@
def get_base_model(model: Union[Actor, Critic, RewardModel]) -> nn.Module:
"""Get the base model of our wrapper classes.
- For Actor, Critic and RewardModel, return ``model.model``,
+ For Actor, Critic and RewardModel, return ``model.model``,
it's usually a ``transformers.PreTrainedModel``.
Args:
@@ -18,9 +18,10 @@ def get_base_model(model: Union[Actor, Critic, RewardModel]) -> nn.Module:
Returns:
nn.Module: the base model
"""
- assert isinstance(model, (Actor, Critic, RewardModel)), \
- f'Expect Actor, Critic or RewardModel, got {type(model)}, use unwrap_model first.'
+ assert isinstance(
+ model, (Actor, Critic, RewardModel)
+ ), f"Expect Actor, Critic or RewardModel, got {type(model)}, use unwrap_model first."
return model.model
-__all__ = ['Actor', 'Critic', 'RewardModel', 'get_base_model']
+__all__ = ["Actor", "Critic", "RewardModel", "get_base_model"]
diff --git a/applications/Chat/coati/models/base/actor.py b/applications/Chat/coati/models/base/actor.py
index 6842f81d9b87..8b2b81ed071c 100644
--- a/applications/Chat/coati/models/base/actor.py
+++ b/applications/Chat/coati/models/base/actor.py
@@ -16,18 +16,18 @@ class Actor(LoRAModule):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self, model: nn.Module, lora_rank: int = 0, lora_train_bias: str = 'none') -> None:
+ def __init__(self, model: nn.Module, lora_rank: int = 0, lora_train_bias: str = "none") -> None:
super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
self.model = model
self.convert_to_lora()
def forward(
- self,
- input_ids: torch.LongTensor,
- attention_mask: Optional[torch.Tensor] = None,
- **model_kwargs, # HACK: `generate` method may pass more kwargs
+ self,
+ input_ids: torch.LongTensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ **model_kwargs,
) -> torch.Tensor:
- """Returns model output.
- """
+ """Returns model output."""
output = self.model(input_ids, attention_mask=attention_mask, **model_kwargs)
return output
+
diff --git a/applications/Chat/coati/models/base/critic.py b/applications/Chat/coati/models/base/critic.py
index e68a743a7762..8672365f5783 100644
--- a/applications/Chat/coati/models/base/critic.py
+++ b/applications/Chat/coati/models/base/critic.py
@@ -1,10 +1,7 @@
-from typing import Optional
-
import torch
import torch.nn as nn
from ..lora import LoRAModule
-from ..utils import masked_mean
class Critic(LoRAModule):
@@ -19,36 +16,19 @@ class Critic(LoRAModule):
"""
def __init__(
- self,
- model: nn.Module,
- value_head: nn.Module,
- lora_rank: int = 0,
- lora_train_bias: str = 'none',
- use_action_mask: bool = False,
+ self, model: nn.Module, value_head: nn.Module, lora_rank: int = 0, lora_train_bias: str = "none"
) -> None:
-
super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
self.model = model
self.value_head = value_head
- self.use_action_mask = use_action_mask
self.convert_to_lora()
- def forward(self,
- sequences: torch.LongTensor,
- action_mask: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+ def forward(self, sequences: torch.LongTensor, attention_mask: torch.Tensor) -> torch.Tensor:
outputs = self.model(sequences, attention_mask=attention_mask)
- last_hidden_states = outputs['last_hidden_state']
-
- values = self.value_head(last_hidden_states).squeeze(-1)
-
- if action_mask is not None and self.use_action_mask:
- num_actions = action_mask.size(1)
- prompt_mask = attention_mask[:, :-num_actions]
- values = values[:, :-num_actions]
- value = masked_mean(values, prompt_mask, dim=1)
- return value
-
- values = values[:, :-1]
- value = values.mean(dim=1)
- return value
+ last_hidden_states = outputs["last_hidden_state"]
+ sequence_lengths = torch.max(attention_mask * torch.arange(sequences.size(1), device=sequences.device), dim=1)[
+ 0
+ ]
+ sequence_hidden_states = last_hidden_states[torch.arange(last_hidden_states.size(0)), sequence_lengths]
+ values = self.value_head(sequence_hidden_states).squeeze(1) # ensure shape is (B, )
+ return values
diff --git a/applications/Chat/coati/models/base/reward_model.py b/applications/Chat/coati/models/base/reward_model.py
index ce8c0a1d3568..e9545d1cddaf 100644
--- a/applications/Chat/coati/models/base/reward_model.py
+++ b/applications/Chat/coati/models/base/reward_model.py
@@ -17,11 +17,13 @@ class RewardModel(LoRAModule):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- model: nn.Module,
- value_head: Optional[nn.Module] = None,
- lora_rank: int = 0,
- lora_train_bias: str = 'none') -> None:
+ def __init__(
+ self,
+ model: nn.Module,
+ value_head: Optional[nn.Module] = None,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ ) -> None:
super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
self.model = model
self.convert_to_lora()
@@ -33,9 +35,12 @@ def __init__(self,
else:
self.value_head = nn.Linear(model.config.n_embd, 1)
- def forward(self, sequences: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+ def forward(self, sequences: torch.LongTensor, attention_mask: torch.Tensor) -> torch.Tensor:
outputs = self.model(sequences, attention_mask=attention_mask)
- last_hidden_states = outputs['last_hidden_state']
- values = self.value_head(last_hidden_states)[:, :-1]
- value = values.mean(dim=1).squeeze(1) # ensure shape is (B)
- return value
+ last_hidden_states = outputs["last_hidden_state"]
+ sequence_lengths = torch.max(attention_mask * torch.arange(sequences.size(1), device=sequences.device), dim=1)[
+ 0
+ ]
+ sequence_hidden_states = last_hidden_states[torch.arange(last_hidden_states.size(0)), sequence_lengths]
+ values = self.value_head(sequence_hidden_states).squeeze(1) # ensure shape is (B, )
+ return values
diff --git a/applications/Chat/coati/models/bloom/__init__.py b/applications/Chat/coati/models/bloom/__init__.py
index d0e7f7b1ef94..7af199a67d3b 100644
--- a/applications/Chat/coati/models/bloom/__init__.py
+++ b/applications/Chat/coati/models/bloom/__init__.py
@@ -2,4 +2,4 @@
from .bloom_critic import BLOOMCritic
from .bloom_rm import BLOOMRM
-__all__ = ['BLOOMActor', 'BLOOMCritic', 'BLOOMRM']
+__all__ = ["BLOOMActor", "BLOOMCritic", "BLOOMRM"]
diff --git a/applications/Chat/coati/models/bloom/bloom_actor.py b/applications/Chat/coati/models/bloom/bloom_actor.py
index d7577f096493..73855a2245e7 100644
--- a/applications/Chat/coati/models/bloom/bloom_actor.py
+++ b/applications/Chat/coati/models/bloom/bloom_actor.py
@@ -1,7 +1,6 @@
from typing import Optional
-import torch
-from transformers import BloomConfig, BloomForCausalLM, BloomModel
+from transformers import BloomConfig, BloomForCausalLM
from ..base import Actor
@@ -18,12 +17,14 @@ class BLOOMActor(Actor):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- pretrained: str = None,
- config: Optional[BloomConfig] = None,
- checkpoint: bool = False,
- lora_rank: int = 0,
- lora_train_bias: str = 'none') -> None:
+ def __init__(
+ self,
+ pretrained: str = None,
+ config: Optional[BloomConfig] = None,
+ checkpoint: bool = False,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ ) -> None:
if pretrained is not None:
model = BloomForCausalLM.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/coati/models/bloom/bloom_critic.py b/applications/Chat/coati/models/bloom/bloom_critic.py
index a3716ca94138..b2d838f7ffc5 100644
--- a/applications/Chat/coati/models/bloom/bloom_critic.py
+++ b/applications/Chat/coati/models/bloom/bloom_critic.py
@@ -1,8 +1,7 @@
from typing import Optional
-import torch
import torch.nn as nn
-from transformers import BloomConfig, BloomForCausalLM, BloomModel
+from transformers import BloomConfig, BloomModel
from ..base import Critic
@@ -18,12 +17,14 @@ class BLOOMCritic(Critic):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- pretrained: str = None,
- config: Optional[BloomConfig] = None,
- lora_rank: int = 0,
- lora_train_bias: str = 'none',
- **kwargs) -> None:
+ def __init__(
+ self,
+ pretrained: str = None,
+ config: Optional[BloomConfig] = None,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ **kwargs,
+ ) -> None:
if pretrained is not None:
model = BloomModel.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/coati/models/bloom/bloom_rm.py b/applications/Chat/coati/models/bloom/bloom_rm.py
index e6ca9b1d4851..c09457ddc8c7 100644
--- a/applications/Chat/coati/models/bloom/bloom_rm.py
+++ b/applications/Chat/coati/models/bloom/bloom_rm.py
@@ -1,7 +1,7 @@
from typing import Optional
import torch.nn as nn
-from transformers import BloomConfig, BloomForCausalLM, BloomModel
+from transformers import BloomConfig, BloomModel
from ..base import RewardModel
@@ -17,11 +17,13 @@ class BLOOMRM(RewardModel):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- pretrained: str = None,
- config: Optional[BloomConfig] = None,
- lora_rank: int = 0,
- lora_train_bias: str = 'none') -> None:
+ def __init__(
+ self,
+ pretrained: str = None,
+ config: Optional[BloomConfig] = None,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ ) -> None:
if pretrained is not None:
model = BloomModel.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/coati/models/chatglm/__init__.py b/applications/Chat/coati/models/chatglm/__init__.py
index 373f19553fdc..5956f5a8e91b 100644
--- a/applications/Chat/coati/models/chatglm/__init__.py
+++ b/applications/Chat/coati/models/chatglm/__init__.py
@@ -1,3 +1,3 @@
from .chatglm_actor import ChatGLMActor
-__all__ = ['ChatGLMActor']
\ No newline at end of file
+__all__ = ["ChatGLMActor"]
diff --git a/applications/Chat/coati/models/chatglm/chatglm_actor.py b/applications/Chat/coati/models/chatglm/chatglm_actor.py
index c35d994e9319..00a61561ee47 100644
--- a/applications/Chat/coati/models/chatglm/chatglm_actor.py
+++ b/applications/Chat/coati/models/chatglm/chatglm_actor.py
@@ -1,11 +1,9 @@
from typing import Optional
-import torch
+from ..base import Actor
from .configuration_chatglm import ChatGLMConfig
from .modeling_chatglm import ChatGLMForConditionalGeneration
-from ..base import Actor
-
class ChatGLMActor(Actor):
"""
@@ -19,10 +17,9 @@ class ChatGLMActor(Actor):
do not support lora for now.
"""
- def __init__(self,
- pretrained: str = None,
- config: Optional[ChatGLMConfig] = None,
- checkpoint: bool = False) -> None:
+ def __init__(
+ self, pretrained: str = None, config: Optional[ChatGLMConfig] = None, checkpoint: bool = False
+ ) -> None:
if pretrained is not None:
model = ChatGLMForConditionalGeneration.from_pretrained(pretrained)
elif config is not None:
@@ -31,4 +28,4 @@ def __init__(self,
model = ChatGLMForConditionalGeneration(ChatGLMConfig())
if checkpoint:
model.gradient_checkpointing_enable()
- super().__init__(model, lora_rank=0, lora_train_bias='none')
+ super().__init__(model, lora_rank=0, lora_train_bias="none")
diff --git a/applications/Chat/coati/models/chatglm/chatglm_tokenizer.py b/applications/Chat/coati/models/chatglm/chatglm_tokenizer.py
index f7717f7e68b6..221ef044b470 100644
--- a/applications/Chat/coati/models/chatglm/chatglm_tokenizer.py
+++ b/applications/Chat/coati/models/chatglm/chatglm_tokenizer.py
@@ -2,15 +2,14 @@
This code is copied from https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py
"""
"""Tokenization classes for ChatGLM."""
-from typing import List, Optional, Union
import os
+from typing import Dict, List, Optional, Union
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.utils import logging, PaddingStrategy
-from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
-from typing import Dict
-import sentencepiece as spm
import numpy as np
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
+from transformers.utils import PaddingStrategy, logging
logger = logging.get_logger(__name__)
@@ -52,11 +51,11 @@ def __len__(self):
class SPTokenizer:
def __init__(
- self,
- vocab_file,
- num_image_tokens=20000,
- max_blank_length=80,
- byte_fallback=True,
+ self,
+ vocab_file,
+ num_image_tokens=20000,
+ max_blank_length=80,
+ byte_fallback=True,
):
assert vocab_file is not None
self.vocab_file = vocab_file
@@ -100,9 +99,7 @@ def _preprocess(self, text: str, linebreak=True, whitespaces=True):
text = self._encode_whitespaces(text, max_len=self.max_blank_length)
return text
- def encode(
- self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
- ) -> List[int]:
+ def encode(self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True) -> List[int]:
"""
@param text: Text to encode.
@param linebreak: Whether to encode newline (\n) in text.
@@ -136,9 +133,7 @@ def decode_tokens(self, tokens: List[str]) -> str:
text = self.postprocess(text)
return text
- def tokenize(
- self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
- ) -> List[str]:
+ def tokenize(self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True) -> List[str]:
"""
@param text: Text to encode.
@param linebreak: Whether to encode newline (\n) in text.
@@ -181,20 +176,20 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
model_input_names = ["input_ids", "attention_mask", "position_ids"]
def __init__(
- self,
- vocab_file,
- do_lower_case=False,
- remove_space=False,
- bos_token='',
- eos_token='',
- end_token='',
- mask_token='[MASK]',
- gmask_token='[gMASK]',
- padding_side="left",
- pad_token="",
- unk_token="",
- num_image_tokens=20000,
- **kwargs
+ self,
+ vocab_file,
+ do_lower_case=False,
+ remove_space=False,
+ bos_token="",
+ eos_token="",
+ end_token="",
+ mask_token="[MASK]",
+ gmask_token="[gMASK]",
+ padding_side="left",
+ pad_token="",
+ unk_token="",
+ num_image_tokens=20000,
+ **kwargs,
) -> None:
super().__init__(
do_lower_case=do_lower_case,
@@ -208,7 +203,7 @@ def __init__(
pad_token=pad_token,
unk_token=unk_token,
num_image_tokens=num_image_tokens,
- **kwargs
+ **kwargs,
)
self.do_lower_case = do_lower_case
@@ -243,11 +238,11 @@ def end_token_id(self) -> Optional[int]:
@property
def vocab_size(self):
- """ Returns vocab size """
+ """Returns vocab size"""
return self.sp_tokenizer.num_tokens
def get_vocab(self):
- """ Returns vocab as a dict """
+ """Returns vocab as a dict"""
vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
@@ -264,7 +259,7 @@ def preprocess_text(self, inputs):
return outputs
def _tokenize(self, text, **kwargs):
- """ Returns a tokenized string. """
+ """Returns a tokenized string."""
text = self.preprocess_text(text)
seq = self.sp_tokenizer.tokenize(text)
@@ -274,11 +269,7 @@ def _tokenize(self, text, **kwargs):
def convert_tokens_to_string(self, tokens: List[str]) -> str:
return self.sp_tokenizer.decode_tokens(tokens)
- def _decode(
- self,
- token_ids: Union[int, List[int]],
- **kwargs
- ) -> str:
+ def _decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
if isinstance(token_ids, int):
token_ids = [token_ids]
if len(token_ids) == 0:
@@ -288,7 +279,7 @@ def _decode(
return super()._decode(token_ids, **kwargs)
def _convert_token_to_id(self, token):
- """ Converts a token (str) in an id using the vocab. """
+ """Converts a token (str) in an id using the vocab."""
return self.sp_tokenizer[token]
def _convert_id_to_token(self, index):
@@ -309,13 +300,11 @@ def save_vocabulary(self, save_directory, filename_prefix=None):
`Tuple(str)`: Paths to the files saved.
"""
if os.path.isdir(save_directory):
- vocab_file = os.path.join(
- save_directory, self.vocab_files_names["vocab_file"]
- )
+ vocab_file = os.path.join(save_directory, self.vocab_files_names["vocab_file"])
else:
vocab_file = save_directory
- with open(self.vocab_file, 'rb') as fin:
+ with open(self.vocab_file, "rb") as fin:
proto_str = fin.read()
with open(vocab_file, "wb") as writer:
@@ -324,7 +313,7 @@ def save_vocabulary(self, save_directory, filename_prefix=None):
return (vocab_file,)
def build_inputs_with_special_tokens(
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
@@ -343,19 +332,19 @@ def build_inputs_with_special_tokens(
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
gmask_id = self.sp_tokenizer[self.gmask_token]
- eos_id = self.sp_tokenizer[self.eos_token]
+ self.sp_tokenizer[self.eos_token]
token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
if token_ids_1 is not None:
token_ids_0 = token_ids_0 + token_ids_1
return token_ids_0
def _pad(
- self,
- encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
- max_length: Optional[int] = None,
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
- pad_to_multiple_of: Optional[int] = None,
- return_attention_mask: Optional[bool] = None,
+ self,
+ encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+ max_length: Optional[int] = None,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ pad_to_multiple_of: Optional[int] = None,
+ return_attention_mask: Optional[bool] = None,
) -> dict:
"""
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
@@ -421,17 +410,23 @@ def _pad(
mask_position = required_input.index(mask_token)
position_ids[context_length:] = mask_position
block_position_ids = np.concatenate(
- [np.zeros(context_length, dtype=np.int64),
- np.arange(1, seq_length - context_length + 1, dtype=np.int64)])
+ [
+ np.zeros(context_length, dtype=np.int64),
+ np.arange(1, seq_length - context_length + 1, dtype=np.int64),
+ ]
+ )
encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
if needs_to_be_padded:
difference = max_length - len(required_input)
if "attention_mask" in encoded_inputs:
- encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
- pad_width=[(0, 0), (difference, 0), (difference, 0)],
- mode='constant', constant_values=True)
+ encoded_inputs["attention_mask"] = np.pad(
+ encoded_inputs["attention_mask"],
+ pad_width=[(0, 0), (difference, 0), (difference, 0)],
+ mode="constant",
+ constant_values=True,
+ )
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
"token_type_ids"
@@ -439,8 +434,9 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
if "position_ids" in encoded_inputs:
- encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
- pad_width=[(0, 0), (difference, 0)])
+ encoded_inputs["position_ids"] = np.pad(
+ encoded_inputs["position_ids"], pad_width=[(0, 0), (difference, 0)]
+ )
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
- return encoded_inputs
\ No newline at end of file
+ return encoded_inputs
diff --git a/applications/Chat/coati/models/chatglm/configuration_chatglm.py b/applications/Chat/coati/models/chatglm/configuration_chatglm.py
index d0e3f6cc63d7..a6d2ccd18715 100644
--- a/applications/Chat/coati/models/chatglm/configuration_chatglm.py
+++ b/applications/Chat/coati/models/chatglm/configuration_chatglm.py
@@ -56,30 +56,29 @@ class ChatGLMConfig(PretrainedConfig):
>>> # Accessing the model configuration
>>> configuration = model.config
- ```
-"""
+ ```"""
model_type = "chatglm"
def __init__(
- self,
- vocab_size=130528,
- hidden_size=4096,
- num_layers=28,
- num_attention_heads=32,
- layernorm_epsilon=1e-5,
- use_cache=True,
- bos_token_id=130004,
- eos_token_id=130005,
- mask_token_id=130000,
- gmask_token_id=130001,
- pad_token_id=3,
- max_sequence_length=2048,
- inner_hidden_size=16384,
- position_encoding_2d=True,
- quantization_bit=0,
- pre_seq_len=None,
- prefix_projection=False,
- **kwargs
+ self,
+ vocab_size=130528,
+ hidden_size=4096,
+ num_layers=28,
+ num_attention_heads=32,
+ layernorm_epsilon=1e-5,
+ use_cache=True,
+ bos_token_id=130004,
+ eos_token_id=130005,
+ mask_token_id=130000,
+ gmask_token_id=130001,
+ pad_token_id=3,
+ max_sequence_length=2048,
+ inner_hidden_size=16384,
+ position_encoding_2d=True,
+ quantization_bit=0,
+ pre_seq_len=None,
+ prefix_projection=False,
+ **kwargs,
):
self.num_layers = num_layers
self.vocab_size = vocab_size
@@ -99,9 +98,4 @@ def __init__(
self.pre_seq_len = pre_seq_len
self.prefix_projection = prefix_projection
- super().__init__(
- pad_token_id=pad_token_id,
- bos_token_id=bos_token_id,
- eos_token_id=eos_token_id,
- **kwargs
- )
\ No newline at end of file
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
diff --git a/applications/Chat/coati/models/chatglm/modeling_chatglm.py b/applications/Chat/coati/models/chatglm/modeling_chatglm.py
index 77e7d0d8ea09..d1d15c68ffd8 100644
--- a/applications/Chat/coati/models/chatglm/modeling_chatglm.py
+++ b/applications/Chat/coati/models/chatglm/modeling_chatglm.py
@@ -4,41 +4,40 @@
""" PyTorch ChatGLM model. """
-import math
import copy
+import math
import os
-import warnings
import re
import sys
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
-import torch.utils.checkpoint
import torch.nn.functional as F
+import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss, LayerNorm
from torch.nn.utils import skip_init
-from typing import Optional, Tuple, Union, List, Callable, Dict, Any
-
-from transformers.utils import (
- add_code_sample_docstrings,
- add_start_docstrings,
- add_start_docstrings_to_model_forward,
-)
+from transformers.generation.logits_process import LogitsProcessor
+from transformers.generation.utils import GenerationConfig, LogitsProcessorList, ModelOutput, StoppingCriteriaList
from transformers.modeling_outputs import (
BaseModelOutputWithPast,
- CausalLMOutputWithPast,
BaseModelOutputWithPastAndCrossAttentions,
+ CausalLMOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import logging
-from transformers.generation.logits_process import LogitsProcessor
-from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
+from transformers.utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+)
from .configuration_chatglm import ChatGLMConfig
# flags required to enable jit fusion kernels
-if sys.platform != 'darwin':
+if sys.platform != "darwin":
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
@@ -93,8 +92,8 @@ def load_tf_weights_in_chatglm_6b(model, config, tf_checkpoint_path):
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
# which are not required for using pretrained model
if any(
- n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
- for n in name
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+ for n in name
):
logger.info(f"Skipping {'/'.join(name)}")
continue
@@ -127,7 +126,7 @@ def load_tf_weights_in_chatglm_6b(model, config, tf_checkpoint_path):
array = np.transpose(array)
try:
assert (
- pointer.shape == array.shape
+ pointer.shape == array.shape
), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
@@ -153,7 +152,7 @@ def __init__(self, config):
self.trans = torch.nn.Sequential(
torch.nn.Linear(config.hidden_size, config.hidden_size),
torch.nn.Tanh(),
- torch.nn.Linear(config.hidden_size, config.num_layers * config.hidden_size * 2)
+ torch.nn.Linear(config.hidden_size, config.num_layers * config.hidden_size * 2),
)
else:
self.embedding = torch.nn.Embedding(config.pre_seq_len, config.num_layers * config.hidden_size * 2)
@@ -170,8 +169,7 @@ def forward(self, prefix: torch.Tensor):
@torch.jit.script
def gelu_impl(x):
"""OpenAI's gelu implementation."""
- return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
- (1.0 + 0.044715 * x * x)))
+ return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))
def gelu(x):
@@ -181,21 +179,22 @@ def gelu(x):
class RotaryEmbedding(torch.nn.Module):
def __init__(self, dim, base=10000, precision=torch.half, learnable=False):
super().__init__()
- inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
inv_freq = inv_freq.half()
self.learnable = learnable
if learnable:
self.inv_freq = torch.nn.Parameter(inv_freq)
self.max_seq_len_cached = None
else:
- self.register_buffer('inv_freq', inv_freq)
+ self.register_buffer("inv_freq", inv_freq)
self.max_seq_len_cached = None
self.cos_cached = None
self.sin_cached = None
self.precision = precision
- def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
- error_msgs):
+ def _load_from_state_dict(
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+ ):
pass
def forward(self, x, seq_dim=1, seq_len=None):
@@ -204,7 +203,7 @@ def forward(self, x, seq_dim=1, seq_len=None):
if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached):
self.max_seq_len_cached = None if self.learnable else seq_len
t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
- freqs = torch.einsum('i,j->ij', t, self.inv_freq)
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
if self.precision == torch.bfloat16:
@@ -230,30 +229,31 @@ def _apply(self, fn):
def rotate_half(x):
- x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=x1.ndim - 1) # dim=-1 triggers a bug in earlier torch versions
@torch.jit.script
def apply_rotary_pos_emb_index(q, k, cos, sin, position_id):
# position_id: [sq, b], q, k: [sq, b, np, hn], cos: [sq, 1, hn] -> [sq, b, 1, hn]
- cos, sin = F.embedding(position_id, cos.squeeze(1)).unsqueeze(2), \
- F.embedding(position_id, sin.squeeze(1)).unsqueeze(2)
+ cos, sin = F.embedding(position_id, cos.squeeze(1)).unsqueeze(2), F.embedding(
+ position_id, sin.squeeze(1)
+ ).unsqueeze(2)
q, k = (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
return q, k
def attention_fn(
- self,
- query_layer,
- key_layer,
- value_layer,
- attention_mask,
- hidden_size_per_partition,
- layer_id,
- layer_past=None,
- scaling_attention_score=True,
- use_cache=False,
+ self,
+ query_layer,
+ key_layer,
+ value_layer,
+ attention_mask,
+ hidden_size_per_partition,
+ layer_id,
+ layer_past=None,
+ scaling_attention_score=True,
+ use_cache=False,
):
if layer_past is not None:
past_key, past_value = layer_past[0], layer_past[1]
@@ -285,7 +285,9 @@ def attention_fn(
key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
matmul_result = torch.zeros(
- 1, 1, 1,
+ 1,
+ 1,
+ 1,
dtype=query_layer.dtype,
device=query_layer.device,
)
@@ -355,9 +357,17 @@ def default_init(cls, *args, **kwargs):
class SelfAttention(torch.nn.Module):
- def __init__(self, hidden_size, num_attention_heads,
- layer_id, hidden_size_per_attention_head=None, bias=True,
- params_dtype=torch.float, position_encoding_2d=True, empty_init=True):
+ def __init__(
+ self,
+ hidden_size,
+ num_attention_heads,
+ layer_id,
+ hidden_size_per_attention_head=None,
+ bias=True,
+ params_dtype=torch.float,
+ position_encoding_2d=True,
+ empty_init=True,
+ ):
if empty_init:
init_method = skip_init
else:
@@ -410,8 +420,7 @@ def attention_mask_func(attention_scores, attention_mask):
attention_scores.masked_fill_(attention_mask, -10000.0)
return attention_scores
- def split_tensor_along_last_dim(self, tensor, num_partitions,
- contiguous_split_chunks=False):
+ def split_tensor_along_last_dim(self, tensor, num_partitions, contiguous_split_chunks=False):
"""Split a tensor along its last dimension.
Arguments:
tensor: input tensor.
@@ -431,14 +440,14 @@ def split_tensor_along_last_dim(self, tensor, num_partitions,
return tensor_list
def forward(
- self,
- hidden_states: torch.Tensor,
- position_ids,
- attention_mask: torch.Tensor,
- layer_id,
- layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
- use_cache: bool = False,
- output_attentions: bool = False,
+ self,
+ hidden_states: torch.Tensor,
+ position_ids,
+ attention_mask: torch.Tensor,
+ layer_id,
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: bool = False,
+ output_attentions: bool = False,
):
"""
hidden_states: [seq_len, batch, hidden_size]
@@ -462,8 +471,10 @@ def forward(
q1, q2 = query_layer.chunk(2, dim=(query_layer.ndim - 1))
k1, k2 = key_layer.chunk(2, dim=(key_layer.ndim - 1))
cos, sin = self.rotary_emb(q1, seq_len=position_ids.max() + 1)
- position_ids, block_position_ids = position_ids[:, 0, :].transpose(0, 1).contiguous(), \
- position_ids[:, 1, :].transpose(0, 1).contiguous()
+ position_ids, block_position_ids = (
+ position_ids[:, 0, :].transpose(0, 1).contiguous(),
+ position_ids[:, 1, :].transpose(0, 1).contiguous(),
+ )
q1, k1 = apply_rotary_pos_emb_index(q1, k1, cos, sin, position_ids)
q2, k2 = apply_rotary_pos_emb_index(q2, k2, cos, sin, block_position_ids)
query_layer = torch.concat([q1, q2], dim=(q1.ndim - 1))
@@ -484,7 +495,7 @@ def forward(
hidden_size_per_partition=self.hidden_size_per_partition,
layer_id=layer_id,
layer_past=layer_past,
- use_cache=use_cache
+ use_cache=use_cache,
)
output = self.dense(context_layer)
@@ -509,8 +520,16 @@ def forward(self, x):
class GLU(torch.nn.Module):
- def __init__(self, hidden_size, inner_hidden_size=None,
- layer_id=None, bias=True, activation_func=gelu, params_dtype=torch.float, empty_init=True):
+ def __init__(
+ self,
+ hidden_size,
+ inner_hidden_size=None,
+ layer_id=None,
+ bias=True,
+ activation_func=gelu,
+ params_dtype=torch.float,
+ empty_init=True,
+ ):
super(GLU, self).__init__()
if empty_init:
init_method = skip_init
@@ -557,19 +576,19 @@ def forward(self, hidden_states):
class GLMBlock(torch.nn.Module):
def __init__(
- self,
- hidden_size,
- num_attention_heads,
- layernorm_epsilon,
- layer_id,
- inner_hidden_size=None,
- hidden_size_per_attention_head=None,
- layernorm=LayerNorm,
- use_bias=True,
- params_dtype=torch.float,
- num_layers=28,
- position_encoding_2d=True,
- empty_init=True
+ self,
+ hidden_size,
+ num_attention_heads,
+ layernorm_epsilon,
+ layer_id,
+ inner_hidden_size=None,
+ hidden_size_per_attention_head=None,
+ layernorm=LayerNorm,
+ use_bias=True,
+ params_dtype=torch.float,
+ num_layers=28,
+ position_encoding_2d=True,
+ empty_init=True,
):
super(GLMBlock, self).__init__()
# Set output layer initialization if not provided.
@@ -590,7 +609,7 @@ def __init__(
bias=use_bias,
params_dtype=params_dtype,
position_encoding_2d=self.position_encoding_2d,
- empty_init=empty_init
+ empty_init=empty_init,
)
# Layernorm on the input data.
@@ -605,18 +624,18 @@ def __init__(
bias=use_bias,
layer_id=layer_id,
params_dtype=params_dtype,
- empty_init=empty_init
+ empty_init=empty_init,
)
def forward(
- self,
- hidden_states: torch.Tensor,
- position_ids,
- attention_mask: torch.Tensor,
- layer_id,
- layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
- use_cache: bool = False,
- output_attentions: bool = False,
+ self,
+ hidden_states: torch.Tensor,
+ position_ids,
+ attention_mask: torch.Tensor,
+ layer_id,
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ use_cache: bool = False,
+ output_attentions: bool = False,
):
"""
hidden_states: [seq_len, batch, hidden_size]
@@ -635,7 +654,7 @@ def forward(
layer_id=layer_id,
layer_past=layer_past,
use_cache=use_cache,
- output_attentions=output_attentions
+ output_attentions=output_attentions,
)
attention_output = attention_outputs[0]
@@ -702,10 +721,15 @@ def get_position_ids(self, input_ids, mask_positions, device, use_gmasks=None):
position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
for i, context_length in enumerate(context_lengths):
position_ids[i, context_length:] = mask_positions[i]
- block_position_ids = [torch.cat((
- torch.zeros(context_length, dtype=torch.long, device=device),
- torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1
- )) for context_length in context_lengths]
+ block_position_ids = [
+ torch.cat(
+ (
+ torch.zeros(context_length, dtype=torch.long, device=device),
+ torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1,
+ )
+ )
+ for context_length in context_lengths
+ ]
block_position_ids = torch.stack(block_position_ids, dim=0)
position_ids = torch.stack((position_ids, block_position_ids), dim=1)
else:
@@ -823,9 +847,7 @@ def __init__(self, config: ChatGLMConfig, empty_init=True):
self.prefix_projection = config.prefix_projection
self.word_embeddings = init_method(
- torch.nn.Embedding,
- num_embeddings=self.vocab_size, embedding_dim=self.hidden_size,
- dtype=self.params_dtype
+ torch.nn.Embedding, num_embeddings=self.vocab_size, embedding_dim=self.hidden_size, dtype=self.params_dtype
)
self.gradient_checkpointing = False
@@ -841,12 +863,10 @@ def get_layer(layer_id):
use_bias=True,
params_dtype=self.params_dtype,
position_encoding_2d=self.position_encoding_2d,
- empty_init=empty_init
+ empty_init=empty_init,
)
- self.layers = torch.nn.ModuleList(
- [get_layer(layer_id) for layer_id in range(self.num_layers)]
- )
+ self.layers = torch.nn.ModuleList([get_layer(layer_id) for layer_id in range(self.num_layers)])
# Final layer norm before output.
self.final_layernorm = LayerNorm(self.hidden_size, eps=self.layernorm_epsilon)
@@ -876,7 +896,7 @@ def get_prompt(self, batch_size, device, dtype=torch.half):
self.pre_seq_len,
self.num_layers * 2,
self.num_attention_heads,
- self.hidden_size // self.num_attention_heads
+ self.hidden_size // self.num_attention_heads,
)
# seq_len, b, nh, hidden_size
past_key_values = self.dropout(past_key_values)
@@ -891,18 +911,17 @@ def get_prompt(self, batch_size, device, dtype=torch.half):
config_class=_CONFIG_FOR_DOC,
)
def forward(
- self,
- input_ids: Optional[torch.LongTensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
- inputs_embeds: Optional[torch.LongTensor] = None,
- use_cache: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+ inputs_embeds: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPast]:
-
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -931,17 +950,14 @@ def forward(
if past_key_values is None:
if self.pre_seq_len is not None:
- past_key_values = self.get_prompt(batch_size=input_ids.shape[0], device=input_ids.device,
- dtype=inputs_embeds.dtype)
+ past_key_values = self.get_prompt(
+ batch_size=input_ids.shape[0], device=input_ids.device, dtype=inputs_embeds.dtype
+ )
else:
past_key_values = tuple([None] * len(self.layers))
if attention_mask is None:
- attention_mask = self.get_masks(
- input_ids,
- device=input_ids.device
- )
-
+ attention_mask = self.get_masks(input_ids, device=input_ids.device)
if position_ids is None:
MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
@@ -955,15 +971,13 @@ def forward(
use_gmasks.append(use_gmask)
position_ids = self.get_position_ids(
- input_ids,
- mask_positions=mask_positions,
- device=input_ids.device,
- use_gmasks=use_gmasks
+ input_ids, mask_positions=mask_positions, device=input_ids.device, use_gmasks=use_gmasks
)
if self.pre_seq_len is not None and attention_mask is not None:
prefix_attention_mask = torch.ones(batch_size, 1, input_ids.size(-1), self.pre_seq_len).to(
- attention_mask.device)
+ attention_mask.device
+ )
prefix_attention_mask = (prefix_attention_mask < 0.5).bool()
attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=3)
@@ -980,7 +994,6 @@ def forward(
attention_mask = attention_mask.to(hidden_states.device)
for i, layer in enumerate(self.layers):
-
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_past = past_key_values[i]
@@ -994,7 +1007,7 @@ def forward(
torch.tensor(i),
layer_past,
use_cache,
- output_attentions
+ output_attentions,
)
else:
layer_ret = layer(
@@ -1004,7 +1017,7 @@ def forward(
layer_id=torch.tensor(i),
layer_past=layer_past,
use_cache=use_cache,
- output_attentions=output_attentions
+ output_attentions=output_attentions,
)
hidden_states = layer_ret[0]
@@ -1049,13 +1062,7 @@ def __init__(self, config: ChatGLMConfig, empty_init=True):
self.transformer = ChatGLMModel(config, empty_init=empty_init)
- self.lm_head = init_method(
- nn.Linear,
- config.hidden_size,
- config.vocab_size,
- bias=False,
- dtype=torch.half
- )
+ self.lm_head = init_method(nn.Linear, config.hidden_size, config.vocab_size, bias=False, dtype=torch.half)
self.config = config
@@ -1087,32 +1094,29 @@ def _update_model_kwargs_for_generation(
attention_mask = model_kwargs["attention_mask"]
if attention_mask is not None and attention_mask.dtype == torch.bool:
attention_mask = torch.cat(
- [attention_mask, attention_mask.new_ones((*attention_mask.shape[:3], 1))], dim=3)
+ [attention_mask, attention_mask.new_ones((*attention_mask.shape[:3], 1))], dim=3
+ )
new_attention_mask = attention_mask[:, :, -1:].clone()
new_attention_mask[..., -1] = False
- model_kwargs["attention_mask"] = torch.cat(
- [attention_mask, new_attention_mask], dim=2
- )
+ model_kwargs["attention_mask"] = torch.cat([attention_mask, new_attention_mask], dim=2)
# update position ids
if "position_ids" in model_kwargs:
position_ids = model_kwargs["position_ids"]
new_position_id = position_ids[..., -1:].clone()
new_position_id[:, 1, :] += 1
- model_kwargs["position_ids"] = torch.cat(
- [position_ids, new_position_id], dim=-1
- )
+ model_kwargs["position_ids"] = torch.cat([position_ids, new_position_id], dim=-1)
return model_kwargs
def prepare_inputs_for_generation(
- self,
- input_ids: torch.LongTensor,
- past: Optional[torch.Tensor] = None,
- past_key_values: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.Tensor] = None,
- **kwargs
+ self,
+ input_ids: torch.LongTensor,
+ past: Optional[torch.Tensor] = None,
+ past_key_values: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ **kwargs,
) -> dict:
batch_size, seq_length = input_ids.shape
MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
@@ -1137,11 +1141,17 @@ def prepare_inputs_for_generation(
context_lengths = [seq.index(self.config.bos_token_id) for seq in seqs]
if self.position_encoding_2d:
position_ids = torch.tensor(
- [[mask_position, seq_length - context_length] for mask_position, context_length in
- zip(mask_positions, context_lengths)], dtype=torch.long, device=input_ids.device).unsqueeze(-1)
+ [
+ [mask_position, seq_length - context_length]
+ for mask_position, context_length in zip(mask_positions, context_lengths)
+ ],
+ dtype=torch.long,
+ device=input_ids.device,
+ ).unsqueeze(-1)
else:
- position_ids = torch.tensor([mask_position for mask_position in mask_positions], dtype=torch.long,
- device=input_ids.device).unsqueeze(-1)
+ position_ids = torch.tensor(
+ [mask_position for mask_position in mask_positions], dtype=torch.long, device=input_ids.device
+ ).unsqueeze(-1)
if past is None:
past = past_key_values
@@ -1149,44 +1159,38 @@ def prepare_inputs_for_generation(
"input_ids": last_token,
"past_key_values": past,
"position_ids": position_ids,
- "attention_mask": attention_mask
+ "attention_mask": attention_mask,
}
else:
if attention_mask is not None and attention_mask.dtype != torch.bool:
logger.warning_once(f"The dtype of attention mask ({attention_mask.dtype}) is not bool")
attention_mask = None
if attention_mask is None:
- attention_mask = self.get_masks(
- input_ids,
- device=input_ids.device
- )
+ attention_mask = self.get_masks(input_ids, device=input_ids.device)
if position_ids is None:
position_ids = self.get_position_ids(
- input_ids,
- device=input_ids.device,
- mask_positions=mask_positions,
- use_gmasks=use_gmasks
+ input_ids, device=input_ids.device, mask_positions=mask_positions, use_gmasks=use_gmasks
)
return {
"input_ids": input_ids,
"past_key_values": past,
"position_ids": position_ids,
- "attention_mask": attention_mask
+ "attention_mask": attention_mask,
}
def forward(
- self,
- input_ids: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
- inputs_embeds: Optional[torch.Tensor] = None,
- labels: Optional[torch.Tensor] = None,
- use_cache: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
):
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1235,7 +1239,7 @@ def forward(
@staticmethod
def _reorder_cache(
- past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
+ past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
@@ -1268,15 +1272,33 @@ def process_response(self, response):
return response
@torch.no_grad()
- def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, num_beams=1,
- do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
+ def chat(
+ self,
+ tokenizer,
+ query: str,
+ history: List[Tuple[str, str]] = None,
+ max_length: int = 2048,
+ num_beams=1,
+ do_sample=True,
+ top_p=0.7,
+ temperature=0.95,
+ logits_processor=None,
+ **kwargs,
+ ):
if history is None:
history = []
if logits_processor is None:
logits_processor = LogitsProcessorList()
logits_processor.append(InvalidScoreLogitsProcessor())
- gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
- "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+ gen_kwargs = {
+ "max_length": max_length,
+ "num_beams": num_beams,
+ "do_sample": do_sample,
+ "top_p": top_p,
+ "temperature": temperature,
+ "logits_processor": logits_processor,
+ **kwargs,
+ }
if not history:
prompt = query
else:
@@ -1287,22 +1309,38 @@ def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max
inputs = tokenizer([prompt], return_tensors="pt")
inputs = inputs.to(self.device)
outputs = self.generate(**inputs, **gen_kwargs)
- outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
+ outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
response = tokenizer.decode(outputs)
response = self.process_response(response)
history = history + [(query, response)]
return response, history
@torch.no_grad()
- def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048,
- do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
+ def stream_chat(
+ self,
+ tokenizer,
+ query: str,
+ history: List[Tuple[str, str]] = None,
+ max_length: int = 2048,
+ do_sample=True,
+ top_p=0.7,
+ temperature=0.95,
+ logits_processor=None,
+ **kwargs,
+ ):
if history is None:
history = []
if logits_processor is None:
logits_processor = LogitsProcessorList()
logits_processor.append(InvalidScoreLogitsProcessor())
- gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
- "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+ gen_kwargs = {
+ "max_length": max_length,
+ "do_sample": do_sample,
+ "top_p": top_p,
+ "temperature": temperature,
+ "logits_processor": logits_processor,
+ **kwargs,
+ }
if not history:
prompt = query
else:
@@ -1313,7 +1351,7 @@ def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = No
inputs = tokenizer([prompt], return_tensors="pt")
inputs = inputs.to(self.device)
for outputs in self.stream_generate(**inputs, **gen_kwargs):
- outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
+ outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
response = tokenizer.decode(outputs)
response = self.process_response(response)
new_history = history + [(query, response)]
@@ -1321,13 +1359,13 @@ def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = No
@torch.no_grad()
def stream_generate(
- self,
- input_ids,
- generation_config: Optional[GenerationConfig] = None,
- logits_processor: Optional[LogitsProcessorList] = None,
- stopping_criteria: Optional[StoppingCriteriaList] = None,
- prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
- **kwargs,
+ self,
+ input_ids,
+ generation_config: Optional[GenerationConfig] = None,
+ logits_processor: Optional[LogitsProcessorList] = None,
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
+ prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+ **kwargs,
):
batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
diff --git a/applications/Chat/coati/models/generation.py b/applications/Chat/coati/models/generation.py
index de0d63f95f50..4ab0cdc8a3ea 100644
--- a/applications/Chat/coati/models/generation.py
+++ b/applications/Chat/coati/models/generation.py
@@ -2,6 +2,7 @@
import torch
import torch.distributed as dist
+from transformers import PreTrainedTokenizer
from .base import Actor
@@ -16,9 +17,9 @@
from transformers.generation import LogitsProcessorList, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper
-def _prepare_logits_processor(top_k: Optional[int] = None,
- top_p: Optional[float] = None,
- temperature: Optional[float] = None) -> LogitsProcessorList:
+def _prepare_logits_processor(
+ top_k: Optional[int] = None, top_p: Optional[float] = None, temperature: Optional[float] = None
+) -> LogitsProcessorList:
processor_list = LogitsProcessorList()
if temperature is not None and temperature != 1.0:
processor_list.append(TemperatureLogitsWarper(temperature))
@@ -37,18 +38,20 @@ def _is_sequence_finished(unfinished_sequences: torch.Tensor) -> bool:
return unfinished_sequences.max() == 0
-def _sample(model: Actor,
- input_ids: torch.Tensor,
- max_length: int,
- early_stopping: bool = False,
- eos_token_id: Optional[int] = None,
- pad_token_id: Optional[int] = None,
- top_k: Optional[int] = None,
- top_p: Optional[float] = None,
- temperature: Optional[float] = None,
- prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
- update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
- **model_kwargs) -> torch.Tensor:
+def _sample(
+ model: Actor,
+ input_ids: torch.Tensor,
+ max_length: int,
+ early_stopping: bool = False,
+ eos_token_id: Optional[int] = None,
+ pad_token_id: Optional[int] = None,
+ top_k: Optional[int] = None,
+ top_p: Optional[float] = None,
+ temperature: Optional[float] = None,
+ prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
+ update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
+ **model_kwargs,
+) -> torch.Tensor:
if input_ids.size(1) >= max_length:
return input_ids
@@ -56,12 +59,13 @@ def _sample(model: Actor,
unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
for _ in range(input_ids.size(1), max_length):
- model_inputs = prepare_inputs_fn(input_ids, **model_kwargs) \
- if prepare_inputs_fn is not None else {'input_ids': input_ids}
+ model_inputs = (
+ prepare_inputs_fn(input_ids, **model_kwargs) if prepare_inputs_fn is not None else {"input_ids": input_ids}
+ )
outputs = model(**model_inputs)
- next_token_logits = outputs['logits'][:, -1, :]
- # pre-process distribution
+ # NOTE: this is correct only in left padding mode
+ next_token_logits = outputs["logits"][:, -1, :]
next_token_logits = logits_processor(input_ids, next_token_logits)
# sample
probs = torch.softmax(next_token_logits, dim=-1, dtype=torch.float)
@@ -69,8 +73,7 @@ def _sample(model: Actor,
# finished sentences should have their next token be a padding token
if eos_token_id is not None:
- if pad_token_id is None:
- raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+ assert pad_token_id is not None, "If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
# update generated ids, model inputs for next step
@@ -90,20 +93,21 @@ def _sample(model: Actor,
@torch.no_grad()
-def generate(model: Actor,
- input_ids: torch.Tensor,
- max_length: int,
- num_beams: int = 1,
- do_sample: bool = True,
- early_stopping: bool = False,
- eos_token_id: Optional[int] = None,
- pad_token_id: Optional[int] = None,
- top_k: Optional[int] = None,
- top_p: Optional[float] = None,
- temperature: Optional[float] = None,
- prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
- update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
- **model_kwargs) -> torch.Tensor:
+def generate(
+ model: Actor,
+ input_ids: torch.Tensor,
+ tokenizer: PreTrainedTokenizer,
+ max_length: int,
+ num_beams: int = 1,
+ do_sample: bool = True,
+ early_stopping: bool = False,
+ top_k: Optional[int] = None,
+ top_p: Optional[float] = None,
+ temperature: Optional[float] = None,
+ prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
+ update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
+ **model_kwargs,
+) -> torch.Tensor:
"""Generate token sequence. The returned sequence is input_ids + generated_tokens.
Args:
@@ -113,34 +117,35 @@ def generate(model: Actor,
num_beams (int, optional): number of beams. Defaults to 1.
do_sample (bool, optional): whether to do sample. Defaults to True.
early_stopping (bool, optional): if True, the sequence length may be smaller than max_length due to finding eos. Defaults to False.
- eos_token_id (Optional[int], optional): end of sequence token id. Defaults to None.
- pad_token_id (Optional[int], optional): pad token id. Defaults to None.
top_k (Optional[int], optional): the number of highest probability vocabulary tokens to keep for top-k-filtering. Defaults to None.
top_p (Optional[float], optional): If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. Defaults to None.
temperature (Optional[float], optional): The value used to module the next token probabilities. Defaults to None.
prepare_inputs_fn (Optional[Callable[[torch.Tensor, Any], dict]], optional): Function to preprocess model inputs. Arguments of this function should be input_ids and model_kwargs. Defaults to None.
update_model_kwargs_fn (Optional[Callable[[dict, Any], dict]], optional): Function to update model_kwargs based on outputs. Arguments of this function should be outputs and model_kwargs. Defaults to None.
"""
- is_greedy_gen_mode = ((num_beams == 1) and do_sample is False)
- is_sample_gen_mode = ((num_beams == 1) and do_sample is True)
- is_beam_gen_mode = ((num_beams > 1) and do_sample is False)
+ assert tokenizer.padding_side == "left", "Current generation only supports left padding."
+ is_greedy_gen_mode = (num_beams == 1) and do_sample is False
+ is_sample_gen_mode = (num_beams == 1) and do_sample is True
+ is_beam_gen_mode = (num_beams > 1) and do_sample is False
if is_greedy_gen_mode:
# run greedy search
raise NotImplementedError
elif is_sample_gen_mode:
# run sample
- return _sample(model,
- input_ids,
- max_length,
- early_stopping=early_stopping,
- eos_token_id=eos_token_id,
- pad_token_id=pad_token_id,
- top_k=top_k,
- top_p=top_p,
- temperature=temperature,
- prepare_inputs_fn=prepare_inputs_fn,
- update_model_kwargs_fn=update_model_kwargs_fn,
- **model_kwargs)
+ return _sample(
+ model,
+ input_ids,
+ max_length,
+ early_stopping=early_stopping,
+ eos_token_id=tokenizer.eos_token_id,
+ pad_token_id=tokenizer.pad_token_id,
+ top_k=top_k,
+ top_p=top_p,
+ temperature=temperature,
+ prepare_inputs_fn=prepare_inputs_fn,
+ update_model_kwargs_fn=update_model_kwargs_fn,
+ **model_kwargs,
+ )
elif is_beam_gen_mode:
raise NotImplementedError
else:
diff --git a/applications/Chat/coati/models/gpt/__init__.py b/applications/Chat/coati/models/gpt/__init__.py
index 63dc5ab0f5ea..823cf4a75e0d 100644
--- a/applications/Chat/coati/models/gpt/__init__.py
+++ b/applications/Chat/coati/models/gpt/__init__.py
@@ -2,4 +2,4 @@
from .gpt_critic import GPTCritic
from .gpt_rm import GPTRM
-__all__ = ['GPTActor', 'GPTCritic', 'GPTRM']
+__all__ = ["GPTActor", "GPTCritic", "GPTRM"]
diff --git a/applications/Chat/coati/models/gpt/gpt_actor.py b/applications/Chat/coati/models/gpt/gpt_actor.py
index ae9d669f1f56..a7e4b9bc3e22 100644
--- a/applications/Chat/coati/models/gpt/gpt_actor.py
+++ b/applications/Chat/coati/models/gpt/gpt_actor.py
@@ -18,13 +18,15 @@ class GPTActor(Actor):
lora_train_bias (str): Bias training strategy for the LoRa layer.
"""
- def __init__(self,
- pretrained: Optional[str] = None,
- config: Optional[GPT2Config] = None,
- checkpoint: bool = False,
- lora_rank: int = 0,
- lora_train_bias: str = 'none',
- **kwargs) -> None:
+ def __init__(
+ self,
+ pretrained: Optional[str] = None,
+ config: Optional[GPT2Config] = None,
+ checkpoint: bool = False,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ **kwargs,
+ ) -> None:
if pretrained is not None:
model = GPT2LMHeadModel.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/coati/models/gpt/gpt_critic.py b/applications/Chat/coati/models/gpt/gpt_critic.py
index 01e1cd10ef57..22ab36dea276 100644
--- a/applications/Chat/coati/models/gpt/gpt_critic.py
+++ b/applications/Chat/coati/models/gpt/gpt_critic.py
@@ -18,12 +18,14 @@ class GPTCritic(Critic):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- pretrained: Optional[str] = None,
- config: Optional[GPT2Config] = None,
- lora_rank: int = 0,
- lora_train_bias: str = 'none',
- **kwargs) -> None:
+ def __init__(
+ self,
+ pretrained: Optional[str] = None,
+ config: Optional[GPT2Config] = None,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ **kwargs,
+ ) -> None:
if pretrained is not None:
model = GPT2Model.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/coati/models/gpt/gpt_rm.py b/applications/Chat/coati/models/gpt/gpt_rm.py
index e52a5a14c1da..8edfc4008466 100644
--- a/applications/Chat/coati/models/gpt/gpt_rm.py
+++ b/applications/Chat/coati/models/gpt/gpt_rm.py
@@ -18,11 +18,13 @@ class GPTRM(RewardModel):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- pretrained: Optional[str] = None,
- config: Optional[GPT2Config] = None,
- lora_rank: int = 0,
- lora_train_bias: str = 'none') -> None:
+ def __init__(
+ self,
+ pretrained: Optional[str] = None,
+ config: Optional[GPT2Config] = None,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ ) -> None:
if pretrained is not None:
model = GPT2Model.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/coati/models/llama/__init__.py b/applications/Chat/coati/models/llama/__init__.py
index 9b2a024afdb2..c87d732538a9 100644
--- a/applications/Chat/coati/models/llama/__init__.py
+++ b/applications/Chat/coati/models/llama/__init__.py
@@ -2,4 +2,4 @@
from .llama_critic import LlamaCritic
from .llama_rm import LlamaRM
-__all__ = ['LlamaActor', 'LlamaCritic', 'LlamaRM']
+__all__ = ["LlamaActor", "LlamaCritic", "LlamaRM"]
diff --git a/applications/Chat/coati/models/llama/llama_actor.py b/applications/Chat/coati/models/llama/llama_actor.py
index 2c7adb390d8b..f1d9406835ca 100644
--- a/applications/Chat/coati/models/llama/llama_actor.py
+++ b/applications/Chat/coati/models/llama/llama_actor.py
@@ -1,7 +1,6 @@
from typing import Optional
-import torch
-from transformers import AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM
+from transformers import LlamaConfig, LlamaForCausalLM
from ..base import Actor
@@ -18,13 +17,14 @@ class LlamaActor(Actor):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- pretrained: Optional[str] = None,
- config: Optional[LlamaConfig] = None,
- checkpoint: bool = False,
- lora_rank: int = 0,
- lora_train_bias: str = 'none') -> None:
-
+ def __init__(
+ self,
+ pretrained: Optional[str] = None,
+ config: Optional[LlamaConfig] = None,
+ checkpoint: bool = False,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ ) -> None:
if pretrained is not None:
model = LlamaForCausalLM.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/coati/models/llama/llama_critic.py b/applications/Chat/coati/models/llama/llama_critic.py
index a67e5de5def6..000dce17ccf0 100644
--- a/applications/Chat/coati/models/llama/llama_critic.py
+++ b/applications/Chat/coati/models/llama/llama_critic.py
@@ -17,13 +17,14 @@ class LlamaCritic(Critic):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- pretrained: Optional[str] = None,
- config: Optional[LlamaConfig] = None,
- lora_rank: int = 0,
- lora_train_bias: str = 'none',
- **kwargs) -> None:
-
+ def __init__(
+ self,
+ pretrained: Optional[str] = None,
+ config: Optional[LlamaConfig] = None,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ **kwargs,
+ ) -> None:
if pretrained is not None:
model = LlamaModel.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/coati/models/llama/llama_rm.py b/applications/Chat/coati/models/llama/llama_rm.py
index d6b62922686e..43bc9e638dc7 100644
--- a/applications/Chat/coati/models/llama/llama_rm.py
+++ b/applications/Chat/coati/models/llama/llama_rm.py
@@ -1,7 +1,7 @@
from typing import Optional
import torch.nn as nn
-from transformers import LlamaConfig, LlamaForCausalLM, LlamaModel
+from transformers import LlamaConfig, LlamaModel
from ..base import RewardModel
@@ -17,12 +17,13 @@ class LlamaRM(RewardModel):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- pretrained: Optional[str] = None,
- config: Optional[LlamaConfig] = None,
- lora_rank: int = 0,
- lora_train_bias: str = 'none') -> None:
-
+ def __init__(
+ self,
+ pretrained: Optional[str] = None,
+ config: Optional[LlamaConfig] = None,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ ) -> None:
if pretrained is not None:
model = LlamaModel.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/coati/models/lora.py b/applications/Chat/coati/models/lora.py
index f1597da540a7..e9bd7b2ed8f0 100644
--- a/applications/Chat/coati/models/lora.py
+++ b/applications/Chat/coati/models/lora.py
@@ -1,4 +1,6 @@
+import dataclasses
import math
+import warnings
from typing import Optional
import loralib as lora
@@ -7,9 +9,16 @@
import torch.nn.functional as F
+@dataclasses.dataclass
+class LoRAManager:
+ merge_weights: bool = False
+
+
+LORA_MANAGER = LoRAManager()
+
+
class LoraLinear(lora.LoRALayer, nn.Module):
- """Replace in-place ops to out-of-place ops to fit gemini. Convert a torch.nn.Linear to LoraLinear.
- """
+ """Replace in-place ops to out-of-place ops to fit gemini. Convert a torch.nn.Linear to LoraLinear."""
def __init__(
self,
@@ -17,16 +26,12 @@ def __init__(
bias: Optional[nn.Parameter],
r: int = 0,
lora_alpha: int = 1,
- lora_dropout: float = 0.,
- fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
- merge_weights: bool = True,
+ lora_dropout: float = 0.0,
+ # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
+ fan_in_fan_out: bool = False,
):
nn.Module.__init__(self)
- lora.LoRALayer.__init__(self,
- r=r,
- lora_alpha=lora_alpha,
- lora_dropout=lora_dropout,
- merge_weights=merge_weights)
+ lora.LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=False)
self.weight = weight
self.bias = bias
@@ -47,45 +52,42 @@ def __init__(
self.weight.data = self.weight.data.T
def reset_parameters(self):
- if hasattr(self, 'lora_A'):
+ if hasattr(self, "lora_A"):
# Initialize A with the default values for nn.Linear and set B to zero.
nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
nn.init.zeros_(self.lora_B)
def train(self, mode: bool = True):
-
- def T(w):
- return w.T if self.fan_in_fan_out else w
-
- nn.Module.train(self, mode)
- if self.merge_weights and self.merged:
- # Make sure that the weights are not merged
- if self.r > 0:
- if not hasattr(self, "lora_A") or not hasattr(self, "lora_B"):
- # FIXME(csric): temporary fix
- self.lora_A = nn.Parameter(self.weight.new_empty((self.r, self.in_features)))
- self.lora_B = nn.Parameter(self.weight.new_empty((self.out_features, self.r)))
- self.reset_parameters()
- else:
- self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
- self.merged = False
-
- def eval(self):
-
def T(w):
return w.T if self.fan_in_fan_out else w
- nn.Module.eval(self)
- if self.merge_weights and not self.merged:
- # Merge the weights and mark it
- if self.r > 0:
- self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
- delattr(self, 'lora_A')
- delattr(self, 'lora_B')
- self.merged = True
+ self.training = mode
+ if LORA_MANAGER.merge_weights:
+ if mode and self.merged:
+ warnings.warn("Invoke module.train() would unmerge LoRA weights.")
+ raise NotImplementedError("LoRA unmerge is not tested.")
+ # Make sure that the weights are not merged
+ if self.r > 0:
+ if not hasattr(self, "lora_A") or not hasattr(self, "lora_B"):
+ # FIXME(csric): temporary fix
+ self.lora_A = nn.Parameter(self.weight.new_empty((self.r, self.in_features)))
+ self.lora_B = nn.Parameter(self.weight.new_empty((self.out_features, self.r)))
+ self.reset_parameters()
+ else:
+ self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
+ self.merged = False
+ elif not mode and not self.merged:
+ warnings.warn("Invoke module.eval() would merge LoRA weights.")
+ # Merge the weights and mark it
+ if self.r > 0:
+ self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
+ delattr(self, "lora_A")
+ delattr(self, "lora_B")
+ self.merged = True
+
+ return self
def forward(self, x: torch.Tensor):
-
def T(w):
return w.T if self.fan_in_fan_out else w
@@ -99,8 +101,10 @@ def T(w):
def _lora_linear_wrapper(linear: nn.Linear, lora_rank: int) -> LoraLinear:
- assert lora_rank <= linear.in_features, f'LoRA rank ({lora_rank}) must be less than or equal to in features ({linear.in_features})'
- lora_linear = LoraLinear(linear.weight, linear.bias, r=lora_rank, merge_weights=False)
+ assert (
+ lora_rank <= linear.in_features
+ ), f"LoRA rank ({lora_rank}) must be less than or equal to in features ({linear.in_features})"
+ lora_linear = LoraLinear(linear.weight, linear.bias, r=lora_rank)
return lora_linear
@@ -112,7 +116,7 @@ def _convert_to_lora_recursively(module: nn.Module, lora_rank: int) -> None:
_convert_to_lora_recursively(child, lora_rank)
-def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: str = 'none') -> nn.Module:
+def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: str = "none") -> nn.Module:
"""Convert a torch.nn.Module to a LoRA module.
Args:
@@ -140,7 +144,7 @@ class LoRAModule(nn.Module):
Defaults to 'none'.
"""
- def __init__(self, lora_rank: int = 0, lora_train_bias: str = 'none') -> None:
+ def __init__(self, lora_rank: int = 0, lora_train_bias: str = "none") -> None:
super().__init__()
self.lora_rank = lora_rank
self.lora_train_bias = lora_train_bias
diff --git a/applications/Chat/coati/models/loss.py b/applications/Chat/coati/models/loss.py
index 05a0b4821797..687bd0f7bfe7 100644
--- a/applications/Chat/coati/models/loss.py
+++ b/applications/Chat/coati/models/loss.py
@@ -13,6 +13,7 @@ class GPTLMLoss(nn.Module):
def __init__(self):
super().__init__()
+ # NOTE: default ignore_index is -100, which is equal to IGNORE_INDEX in sft_dataset.py
self.loss = nn.CrossEntropyLoss()
def forward(self, logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
@@ -31,11 +32,13 @@ def __init__(self, clip_eps: float = 0.2) -> None:
super().__init__()
self.clip_eps = clip_eps
- def forward(self,
- log_probs: torch.Tensor,
- old_log_probs: torch.Tensor,
- advantages: torch.Tensor,
- action_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+ def forward(
+ self,
+ log_probs: torch.Tensor,
+ old_log_probs: torch.Tensor,
+ advantages: torch.Tensor,
+ action_mask: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
ratio = (log_probs - old_log_probs).exp()
surr1 = ratio * advantages
surr2 = ratio.clamp(1 - self.clip_eps, 1 + self.clip_eps) * advantages
@@ -55,14 +58,16 @@ def __init__(self, clip_eps: float = 0.4) -> None:
super().__init__()
self.clip_eps = clip_eps
- def forward(self,
- values: torch.Tensor,
- old_values: torch.Tensor,
- reward: torch.Tensor,
- action_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+ def forward(
+ self,
+ values: torch.Tensor,
+ old_values: torch.Tensor,
+ reward: torch.Tensor,
+ action_mask: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
values_clipped = old_values + (values - old_values).clamp(-self.clip_eps, self.clip_eps)
- surr1 = (values_clipped - reward)**2
- surr2 = (values - reward)**2
+ surr1 = (values_clipped - reward) ** 2
+ surr2 = (values - reward) ** 2
loss = torch.max(surr1, surr2)
loss = loss.mean()
return 0.5 * loss
diff --git a/applications/Chat/coati/models/opt/__init__.py b/applications/Chat/coati/models/opt/__init__.py
index 334f4df0032a..e37d6e45c8fc 100644
--- a/applications/Chat/coati/models/opt/__init__.py
+++ b/applications/Chat/coati/models/opt/__init__.py
@@ -2,4 +2,4 @@
from .opt_critic import OPTCritic
from .opt_rm import OPTRM
-__all__ = ['OPTActor', 'OPTCritic', 'OPTRM']
+__all__ = ["OPTActor", "OPTCritic", "OPTRM"]
diff --git a/applications/Chat/coati/models/opt/opt_actor.py b/applications/Chat/coati/models/opt/opt_actor.py
index c14e4377ffb2..cd8908e13fb8 100644
--- a/applications/Chat/coati/models/opt/opt_actor.py
+++ b/applications/Chat/coati/models/opt/opt_actor.py
@@ -18,12 +18,14 @@ class OPTActor(Actor):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- pretrained: Optional[str] = None,
- config: Optional[OPTConfig] = None,
- checkpoint: bool = False,
- lora_rank: int = 0,
- lora_train_bias: str = 'none') -> None:
+ def __init__(
+ self,
+ pretrained: Optional[str] = None,
+ config: Optional[OPTConfig] = None,
+ checkpoint: bool = False,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ ) -> None:
if pretrained is not None:
model = OPTForCausalLM.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/coati/models/opt/opt_critic.py b/applications/Chat/coati/models/opt/opt_critic.py
index f66c4173fa52..f37d28812c27 100644
--- a/applications/Chat/coati/models/opt/opt_critic.py
+++ b/applications/Chat/coati/models/opt/opt_critic.py
@@ -18,12 +18,14 @@ class OPTCritic(Critic):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- pretrained: Optional[str] = None,
- config: Optional[OPTConfig] = None,
- lora_rank: int = 0,
- lora_train_bias: str = 'none',
- **kwargs) -> None:
+ def __init__(
+ self,
+ pretrained: Optional[str] = None,
+ config: Optional[OPTConfig] = None,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ **kwargs,
+ ) -> None:
if pretrained is not None:
model = OPTModel.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/coati/models/opt/opt_rm.py b/applications/Chat/coati/models/opt/opt_rm.py
index 6f75344e6aae..893708344ad4 100644
--- a/applications/Chat/coati/models/opt/opt_rm.py
+++ b/applications/Chat/coati/models/opt/opt_rm.py
@@ -17,11 +17,13 @@ class OPTRM(RewardModel):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- pretrained: Optional[str] = None,
- config: Optional[OPTConfig] = None,
- lora_rank: int = 0,
- lora_train_bias: str = 'none') -> None:
+ def __init__(
+ self,
+ pretrained: Optional[str] = None,
+ config: Optional[OPTConfig] = None,
+ lora_rank: int = 0,
+ lora_train_bias: str = "none",
+ ) -> None:
if pretrained is not None:
model = OPTModel.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/coati/models/utils.py b/applications/Chat/coati/models/utils.py
index 97637d3523b0..1aaef16620d2 100644
--- a/applications/Chat/coati/models/utils.py
+++ b/applications/Chat/coati/models/utils.py
@@ -4,9 +4,9 @@
import torch.nn.functional as F
-def _compute_approx_kl(log_probs: torch.Tensor,
- log_probs_base: torch.Tensor,
- action_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+def _compute_approx_kl(
+ log_probs: torch.Tensor, log_probs_base: torch.Tensor, action_mask: Optional[torch.Tensor] = None
+) -> torch.Tensor:
"""
Compute the approximate KL divergence between two distributions.
Schulman blog: http://joschu.net/blog/kl-approx.html
@@ -26,11 +26,13 @@ def _compute_approx_kl(log_probs: torch.Tensor,
return approx_kl
-def compute_reward(r: Union[torch.Tensor, float],
- kl_coef: float,
- log_probs: torch.Tensor,
- log_probs_base: torch.Tensor,
- action_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+def compute_reward(
+ r: Union[torch.Tensor, float],
+ kl_coef: float,
+ log_probs: torch.Tensor,
+ log_probs_base: torch.Tensor,
+ action_mask: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
if kl_coef <= 0.0:
return r
kl = _compute_approx_kl(log_probs, log_probs_base, action_mask=action_mask)
@@ -44,18 +46,17 @@ def _log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.
return log_probs_labels.squeeze(-1)
-def calc_action_log_probs(output: torch.Tensor, sequences: torch.LongTensor, num_actions: int) -> torch.Tensor:
+def calc_action_log_probs(logits: torch.Tensor, sequences: torch.LongTensor, num_actions: int) -> torch.Tensor:
"""Calculate action log probs.
Args:
- output (torch.Tensor): Output tensor of Actor.forward.
+ output (torch.Tensor): Output tensor of Actor.forward.logits.
sequences (torch.LongTensor): Input sequences.
num_actions (int): Number of actions.
Returns:
torch.Tensor: Action log probs.
"""
- logits = output['logits']
log_probs = _log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:])
return log_probs[:, -num_actions:]
diff --git a/applications/Chat/coati/quant/__init__.py b/applications/Chat/coati/quant/__init__.py
index a65a78d07bb8..1765b8091bc3 100644
--- a/applications/Chat/coati/quant/__init__.py
+++ b/applications/Chat/coati/quant/__init__.py
@@ -2,6 +2,6 @@
from .utils import low_resource_init
__all__ = [
- 'llama_load_quant',
- 'low_resource_init',
+ "llama_load_quant",
+ "low_resource_init",
]
diff --git a/applications/Chat/coati/quant/llama_gptq/__init__.py b/applications/Chat/coati/quant/llama_gptq/__init__.py
index 51c8d6316290..51d5233586ad 100644
--- a/applications/Chat/coati/quant/llama_gptq/__init__.py
+++ b/applications/Chat/coati/quant/llama_gptq/__init__.py
@@ -1,5 +1,5 @@
from .loader import load_quant
__all__ = [
- 'load_quant',
+ "load_quant",
]
diff --git a/applications/Chat/coati/quant/llama_gptq/loader.py b/applications/Chat/coati/quant/llama_gptq/loader.py
index 5353dc8a2ea3..50486337a7ab 100644
--- a/applications/Chat/coati/quant/llama_gptq/loader.py
+++ b/applications/Chat/coati/quant/llama_gptq/loader.py
@@ -11,14 +11,15 @@ def load_quant(model: nn.Module, checkpoint: str, wbits: int, groupsize: int):
# ignore lm head
layers = find_layers(model)
- for name in ['lm_head']:
+ for name in ["lm_head"]:
if name in layers:
del layers[name]
make_quant(model, layers, wbits, groupsize)
- if checkpoint.endswith('.safetensors'):
+ if checkpoint.endswith(".safetensors"):
from safetensors.torch import load_file as safe_load
+
model.load_state_dict(safe_load(checkpoint))
else:
model.load_state_dict(torch.load(checkpoint))
diff --git a/applications/Chat/coati/quant/llama_gptq/model_utils.py b/applications/Chat/coati/quant/llama_gptq/model_utils.py
index 62db171abb52..18e4e4761500 100644
--- a/applications/Chat/coati/quant/llama_gptq/model_utils.py
+++ b/applications/Chat/coati/quant/llama_gptq/model_utils.py
@@ -1,13 +1,12 @@
# copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/past/modelutils.py
-import torch
import torch.nn as nn
-def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
+def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=""):
if type(module) in layers:
return {name: module}
res = {}
for name1, child in module.named_children():
- res.update(find_layers(child, layers=layers, name=name + '.' + name1 if name != '' else name1))
+ res.update(find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1))
return res
diff --git a/applications/Chat/coati/quant/llama_gptq/quant.py b/applications/Chat/coati/quant/llama_gptq/quant.py
index f7d5b7ce4bd8..5a7e2e72dfc5 100644
--- a/applications/Chat/coati/quant/llama_gptq/quant.py
+++ b/applications/Chat/coati/quant/llama_gptq/quant.py
@@ -13,14 +13,13 @@ def quantize(x, scale, zero, maxq):
class Quantizer(nn.Module):
-
def __init__(self, shape=1):
super(Quantizer, self).__init__()
- self.register_buffer('maxq', torch.tensor(0))
- self.register_buffer('scale', torch.zeros(shape))
- self.register_buffer('zero', torch.zeros(shape))
+ self.register_buffer("maxq", torch.tensor(0))
+ self.register_buffer("scale", torch.zeros(shape))
+ self.register_buffer("zero", torch.zeros(shape))
- def configure(self, bits, perchannel=False, sym=True, mse=False, norm=2.4, grid=100, maxshrink=.8):
+ def configure(self, bits, perchannel=False, sym=True, mse=False, norm=2.4, grid=100, maxshrink=0.8):
self.maxq = torch.tensor(2**bits - 1)
self.perchannel = perchannel
self.sym = sym
@@ -68,7 +67,7 @@ def find_params(self, x, weight=False):
self.zero = torch.round(-xmin / self.scale)
if self.mse:
- best = torch.full([x.shape[0]], float('inf'), device=dev)
+ best = torch.full([x.shape[0]], float("inf"), device=dev)
for i in range(int(self.maxshrink * self.grid)):
p = 1 - i / self.grid
xmin1 = p * xmin
@@ -123,13 +122,12 @@ def ready(self):
try:
import quant_cuda
except:
- print('CUDA extension not installed.')
+ print("CUDA extension not installed.")
# Assumes layer is perfectly divisible into 256 * 256 blocks
class QuantLinear(nn.Module):
-
def __init__(self, bits, groupsize, infeatures, outfeatures):
super().__init__()
if bits not in [2, 3, 4, 8]:
@@ -142,11 +140,11 @@ def __init__(self, bits, groupsize, infeatures, outfeatures):
groupsize = groupsize if groupsize != -1 else infeatures
self.groupsize = groupsize
self.register_buffer(
- 'qzeros', torch.zeros((math.ceil(infeatures / groupsize), outfeatures // 256 * (bits * 8)),
- dtype=torch.int))
- self.register_buffer('scales', torch.zeros((math.ceil(infeatures / groupsize), outfeatures)))
- self.register_buffer('bias', torch.zeros(outfeatures))
- self.register_buffer('qweight', torch.zeros((infeatures // 256 * (bits * 8), outfeatures), dtype=torch.int))
+ "qzeros", torch.zeros((math.ceil(infeatures / groupsize), outfeatures // 256 * (bits * 8)), dtype=torch.int)
+ )
+ self.register_buffer("scales", torch.zeros((math.ceil(infeatures / groupsize), outfeatures)))
+ self.register_buffer("bias", torch.zeros(outfeatures))
+ self.register_buffer("qweight", torch.zeros((infeatures // 256 * (bits * 8), outfeatures), dtype=torch.int))
self._initialized_quant_state = False
def pack(self, linear, scales, zeros):
@@ -161,8 +159,10 @@ def pack(self, linear, scales, zeros):
for idx in range(self.infeatures):
g_idx = idx // self.groupsize
intweight.append(
- torch.round((linear.weight.data[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:,
- None])
+ torch.round((linear.weight.data[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[
+ :, None
+ ]
+ )
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
@@ -271,13 +271,13 @@ def forward(self, x):
return y.reshape(outshape)
-def make_quant(module, names, bits, groupsize, name=''):
+def make_quant(module, names, bits, groupsize, name=""):
if isinstance(module, QuantLinear):
return
for attr in dir(module):
tmp = getattr(module, attr)
- name1 = name + '.' + attr if name != '' else attr
+ name1 = name + "." + attr if name != "" else attr
if name1 in names:
setattr(module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features))
for name1, child in module.named_children():
- make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1)
+ make_quant(child, names, bits, groupsize, name + "." + name1 if name != "" else name1)
diff --git a/applications/Chat/coati/quant/utils.py b/applications/Chat/coati/quant/utils.py
index 01b8cff0add1..d102bb30f52d 100644
--- a/applications/Chat/coati/quant/utils.py
+++ b/applications/Chat/coati/quant/utils.py
@@ -9,8 +9,7 @@ def _noop(*args, **kwargs):
@contextmanager
def low_resource_init():
- """This context manager disables weight initialization and sets the default float dtype to half.
- """
+ """This context manager disables weight initialization and sets the default float dtype to half."""
old_kaiming_uniform_ = torch.nn.init.kaiming_uniform_
old_uniform_ = torch.nn.init.uniform_
old_normal_ = torch.nn.init.normal_
diff --git a/applications/Chat/coati/ray/callbacks/base.py b/applications/Chat/coati/ray/callbacks/base.py
index 3306150a41ff..8c5bd8a67776 100644
--- a/applications/Chat/coati/ray/callbacks/base.py
+++ b/applications/Chat/coati/ray/callbacks/base.py
@@ -5,7 +5,7 @@
class TrainerCallback(ABC):
"""
- Base callback class. It defines the interface for callbacks.
+ Base callback class. It defines the interface for callbacks.
"""
def on_fit_start(self) -> None:
@@ -40,7 +40,6 @@ def on_update_end(self) -> None:
class MakerCallback(ABC):
-
def on_loop_start(self) -> None:
pass
diff --git a/applications/Chat/coati/ray/callbacks/performance_evaluator.py b/applications/Chat/coati/ray/callbacks/performance_evaluator.py
index d3df8f9ae3e0..18798bce7dce 100644
--- a/applications/Chat/coati/ray/callbacks/performance_evaluator.py
+++ b/applications/Chat/coati/ray/callbacks/performance_evaluator.py
@@ -30,10 +30,9 @@ def all_reduce_mean(x: float, world_size: int) -> float:
class Timer:
-
def __init__(self) -> None:
self.start_time: Optional[float] = None
- self.duration: float = 0.
+ self.duration: float = 0.0
def start(self) -> None:
self.start_time = time()
@@ -42,13 +41,13 @@ def end(self) -> None:
self.duration += time() - self.start_time
def reset(self) -> None:
- self.duration = 0.
+ self.duration = 0.0
class ExperienceMakerPerformanceEvaluator(MakerCallback):
-
- def __init__(self, actor_num_params: int, critic_num_params: int, initial_model_num_params: int,
- reward_model_num_params: int) -> None:
+ def __init__(
+ self, actor_num_params: int, critic_num_params: int, initial_model_num_params: int, reward_model_num_params: int
+ ) -> None:
super().__init__()
self.world_size = get_world_size()
self.actor_num_params = actor_num_params
@@ -63,7 +62,7 @@ def __init__(self, actor_num_params: int, critic_num_params: int, initial_model_
self.make_experience_flop: int = 0
print_rank_0(
- f'ExperienceMaker actor: {actor_num_params/1024**3:.2f}B, critic: {critic_num_params/1024**3:.2f}B, initial model: {initial_model_num_params/1024**3:.2f}B, reward model: {reward_model_num_params/1024**3:.2f}B, world size: {self.world_size}'
+ f"ExperienceMaker actor: {actor_num_params/1024**3:.2f}B, critic: {critic_num_params/1024**3:.2f}B, initial model: {initial_model_num_params/1024**3:.2f}B, reward model: {reward_model_num_params/1024**3:.2f}B, world size: {self.world_size}"
)
def on_make_experience_start(self) -> None:
@@ -110,27 +109,29 @@ def on_loop_end(self) -> None:
avg_throughput = self.total_samples * self.world_size / (avg_overall_duration + 1e-12)
avg_make_experience_tflops = self.make_experience_flop / 1e12 / (avg_make_experience_duration + 1e-12)
avg_time_per_sample = (avg_overall_duration + 1e-12) / (self.total_samples * self.world_size)
- avg_make_experience_time_per_sample = (avg_make_experience_duration + 1e-12) / \
- (self.total_samples * self.world_size)
+ avg_make_experience_time_per_sample = (avg_make_experience_duration + 1e-12) / (
+ self.total_samples * self.world_size
+ )
avg_send_time_per_sample = (avg_send_duration + 1e-12) / (self.total_samples * self.world_size)
print_rank_0(
- 'Making Experience Performance Summary:\n' + f'Throughput: {avg_throughput:.3f} samples/sec\n'
- + f'TFLOPS per GPU: {avg_make_experience_tflops:.3f}\n'
- + f'Sample time (overall): {avg_time_per_sample:.3f} s\n'
- + f'Sample time (make experience): {avg_make_experience_time_per_sample:.3f} s, {avg_make_experience_time_per_sample/avg_time_per_sample*100:.2f}%\n'
-
- + f'Sample time (send): {avg_send_time_per_sample:.3f} s, {avg_send_time_per_sample/avg_time_per_sample*100:.2f}%\n'
+ "Making Experience Performance Summary:\n"
+ + f"Throughput: {avg_throughput:.3f} samples/sec\n"
+ + f"TFLOPS per GPU: {avg_make_experience_tflops:.3f}\n"
+ + f"Sample time (overall): {avg_time_per_sample:.3f} s\n"
+ + f"Sample time (make experience): {avg_make_experience_time_per_sample:.3f} s, {avg_make_experience_time_per_sample/avg_time_per_sample*100:.2f}%\n"
+ + f"Sample time (send): {avg_send_time_per_sample:.3f} s, {avg_send_time_per_sample/avg_time_per_sample*100:.2f}%\n"
)
class TrainerPerformanceEvaluator(TrainerCallback):
-
- def __init__(self,
- actor_num_params: int,
- critic_num_params: int,
- enable_grad_checkpoint: bool = False,
- ignore_first_episodes: int = 1) -> None:
+ def __init__(
+ self,
+ actor_num_params: int,
+ critic_num_params: int,
+ enable_grad_checkpoint: bool = False,
+ ignore_first_episodes: int = 1,
+ ) -> None:
super().__init__()
self.world_size = get_world_size()
self.actor_num_params = actor_num_params
@@ -146,7 +147,7 @@ def __init__(self,
self.learn_flop: int = 0
print_rank_0(
- f'Trainer actor: {self.actor_num_params/1024**3:.2f}B, critic: {self.critic_num_params/1024**3:.2f}B, world size: {self.world_size}'
+ f"Trainer actor: {self.actor_num_params/1024**3:.2f}B, critic: {self.critic_num_params/1024**3:.2f}B, world size: {self.world_size}"
)
def on_episode_start(self, episodes: int) -> None:
@@ -191,7 +192,7 @@ def on_update_end(self) -> None:
def on_fit_end(self) -> None:
if self.total_samples == 0:
- print_rank_0('No samples are collected, skip trainer performance evaluation')
+ print_rank_0("No samples are collected, skip trainer performance evaluation")
return
avg_train_duration = all_reduce_mean(self.batch_timer.duration, self.world_size)
avg_update_duration = all_reduce_mean(self.update_timer.duration, self.world_size)
@@ -204,9 +205,10 @@ def on_fit_end(self) -> None:
avg_update_time_per_sample = (avg_update_duration + 1e-12) / (self.total_samples * self.world_size)
print_rank_0(
- 'Learning Performance Summary:\n' + f'Throughput: {avg_throughput:.3f} samples/sec\n'
- + f'TFLOPS per GPU: {avg_learn_tflops:.3f}\n' + f'Sample time (overall): {avg_time_per_sample:.3f} s\n'
- + f'Sample time (train): {avg_train_time_per_sample:.3f} s, {avg_train_time_per_sample/avg_time_per_sample*100:.2f}%\n'
-
- + f'Sample time (update): {avg_update_time_per_sample:.3f} s, {avg_update_time_per_sample/avg_time_per_sample*100:.2f}%\n'
+ "Learning Performance Summary:\n"
+ + f"Throughput: {avg_throughput:.3f} samples/sec\n"
+ + f"TFLOPS per GPU: {avg_learn_tflops:.3f}\n"
+ + f"Sample time (overall): {avg_time_per_sample:.3f} s\n"
+ + f"Sample time (train): {avg_train_time_per_sample:.3f} s, {avg_train_time_per_sample/avg_time_per_sample*100:.2f}%\n"
+ + f"Sample time (update): {avg_update_time_per_sample:.3f} s, {avg_update_time_per_sample/avg_time_per_sample*100:.2f}%\n"
)
diff --git a/applications/Chat/coati/ray/detached_replay_buffer.py b/applications/Chat/coati/ray/detached_replay_buffer.py
index e04bf5ccb881..92dab17292f7 100644
--- a/applications/Chat/coati/ray/detached_replay_buffer.py
+++ b/applications/Chat/coati/ray/detached_replay_buffer.py
@@ -1,20 +1,15 @@
-import asyncio
-import copy
-import random
-from threading import Lock
-from typing import Any, List
+from typing import List
-import ray
import torch
-from coati.experience_buffer import ExperienceBuffer
from coati.experience_buffer.utils import BufferItem, make_experience_batch, split_experience_batch
from coati.experience_maker.base import Experience
+
# from torch.multiprocessing import Queue
from ray.util.queue import Queue
class DetachedReplayBuffer:
- '''
+ """
Detached replay buffer. Share Experience across workers on the same node.
Therefore, a trainer node is expected to have only one instance.
It is ExperienceMakerHolder's duty to call append(exp) method, remotely.
@@ -24,7 +19,7 @@ class DetachedReplayBuffer:
tp_world_size: Number of workers in the same tp group
limit: Limit of number of experience sample BATCHs. A number <= 0 means unlimited. Defaults to 0.
cpu_offload: Whether to offload experience to cpu when sampling. Defaults to True.
- '''
+ """
def __init__(self, sample_batch_size: int, limit: int = 0) -> None:
self.sample_batch_size = sample_batch_size
@@ -34,23 +29,23 @@ def __init__(self, sample_batch_size: int, limit: int = 0) -> None:
@torch.no_grad()
def append(self, experience: Experience) -> None:
- '''
+ """
Expected to be called remotely.
- '''
+ """
items = split_experience_batch(experience)
self.extend(items)
@torch.no_grad()
def extend(self, items: List[BufferItem]) -> None:
- '''
+ """
Expected to be called remotely.
- '''
+ """
self.batch_collector.extend(items)
while len(self.batch_collector) >= self.sample_batch_size:
- items = self.batch_collector[:self.sample_batch_size]
+ items = self.batch_collector[: self.sample_batch_size]
experience = make_experience_batch(items)
self.items.put(experience, block=True)
- self.batch_collector = self.batch_collector[self.sample_batch_size:]
+ self.batch_collector = self.batch_collector[self.sample_batch_size :]
def clear(self) -> None:
# self.items.close()
diff --git a/applications/Chat/coati/ray/detached_trainer_base.py b/applications/Chat/coati/ray/detached_trainer_base.py
index 90399781187a..fcf0a472df9e 100644
--- a/applications/Chat/coati/ray/detached_trainer_base.py
+++ b/applications/Chat/coati/ray/detached_trainer_base.py
@@ -1,6 +1,6 @@
import os
from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, List
import ray
import torch
@@ -15,7 +15,7 @@
class DetachedTrainer(ABC):
- '''
+ """
Base class for detached rlhf trainers.
'detach' means that the experience maker is detached compared to a normal Trainer.
Please set name attribute during init:
@@ -28,15 +28,17 @@ class DetachedTrainer(ABC):
callbacks (List[Callback], defaults to []): the callbacks to call during training process
generate_kwargs (dict, optional): the kwargs to use while model generating
- '''
-
- def __init__(self,
- experience_maker_holder_name_list: List[str],
- train_batch_size: int = 8,
- buffer_limit: int = 0,
- dataloader_pin_memory: bool = True,
- callbacks: List[TrainerCallback] = [],
- debug: bool = False) -> None:
+ """
+
+ def __init__(
+ self,
+ experience_maker_holder_name_list: List[str],
+ train_batch_size: int = 8,
+ buffer_limit: int = 0,
+ dataloader_pin_memory: bool = True,
+ callbacks: List[TrainerCallback] = [],
+ debug: bool = False,
+ ) -> None:
super().__init__()
self.detached_replay_buffer = DetachedReplayBuffer(train_batch_size, limit=buffer_limit)
self.dataloader_pin_memory = dataloader_pin_memory
@@ -67,18 +69,16 @@ def training_step(self, experience: Experience) -> Dict[str, Any]:
def _learn(self, update_steps: int, train_epochs: int) -> None:
data = []
# warmup
- pbar = tqdm(range(update_steps), desc=f'Train epoch [1/{train_epochs}]', disable=not is_rank_0())
+ pbar = tqdm(range(update_steps), desc=f"Train epoch [1/{train_epochs}]", disable=not is_rank_0())
self._on_epoch_start(0)
self._learn_epoch(pbar, data)
self._on_epoch_end(0)
# item is already a batch
- dataloader = DataLoader(data,
- batch_size=1,
- shuffle=True,
- pin_memory=self.dataloader_pin_memory,
- collate_fn=lambda x: x[0])
+ dataloader = DataLoader(
+ data, batch_size=1, shuffle=True, pin_memory=self.dataloader_pin_memory, collate_fn=lambda x: x[0]
+ )
for epoch in range(1, train_epochs):
- pbar = tqdm(dataloader, desc=f'Train epoch [{epoch + 1}/{train_epochs}]', disable=not is_rank_0())
+ pbar = tqdm(dataloader, desc=f"Train epoch [{epoch + 1}/{train_epochs}]", disable=not is_rank_0())
self._on_epoch_start(epoch)
self._learn_epoch(pbar, data)
self._on_epoch_end(epoch)
@@ -104,7 +104,7 @@ def _learn_epoch(self, pbar: tqdm, data: List[Experience]) -> None:
def fit(self, total_steps: int, update_steps: int, train_epochs: int = 1) -> None:
self._on_fit_start()
- for i in tqdm(range(total_steps // update_steps), desc='Trainer', disable=not is_rank_0()):
+ for i in tqdm(range(total_steps // update_steps), desc="Trainer", disable=not is_rank_0()):
self._on_episode_start(i)
self._learn(update_steps, train_epochs)
self._on_update_start()
diff --git a/applications/Chat/coati/ray/detached_trainer_ppo.py b/applications/Chat/coati/ray/detached_trainer_ppo.py
index 2f2aa0e29579..ef84a1ddba48 100644
--- a/applications/Chat/coati/ray/detached_trainer_ppo.py
+++ b/applications/Chat/coati/ray/detached_trainer_ppo.py
@@ -1,12 +1,11 @@
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Callable, Dict, List, Tuple
import ray
import torch
-from coati.experience_maker import Experience, NaiveExperienceMaker
+from coati.experience_maker import Experience
from coati.models.base import Actor, Critic
from coati.models.loss import PolicyLoss, ValueLoss
-from coati.trainer.callbacks import Callback
-from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy, Strategy
+from coati.trainer.strategies import GeminiStrategy, LowLevelZeroStrategy, Strategy
from torch.optim import Adam
from colossalai.nn.optimizer import HybridAdam
@@ -14,27 +13,14 @@
from .callbacks import TrainerCallback, TrainerPerformanceEvaluator
from .detached_trainer_base import DetachedTrainer
from .lora_constructor import LoRAConstructor
-from .utils import (
- get_actor_from_args,
- get_critic_from_args,
- get_model_numel,
- get_rank,
- get_strategy_from_args,
- is_rank_0,
- set_dist_env,
- state_dict_to,
-)
+from .utils import get_model_numel, get_rank, set_dist_env, state_dict_to
-@ray.remote(concurrency_groups={
- "buffer_length": 1,
- "buffer_append": 1,
- "buffer_sample": 1,
- "model_io": 1,
- "compute": 1
-})
+@ray.remote(
+ concurrency_groups={"buffer_length": 1, "buffer_append": 1, "buffer_sample": 1, "model_io": 1, "compute": 1}
+)
class DetachedPPOTrainer(DetachedTrainer):
- '''
+ """
Detached Trainer for PPO algorithm
Args:
strategy (Strategy): the strategy to use for training
@@ -52,7 +38,7 @@ class DetachedPPOTrainer(DetachedTrainer):
dataloader_pin_memory (bool, defaults to True): whether to pin memory for data loader
callbacks (List[Callback], defaults to []): the callbacks to call during training process
generate_kwargs (dict, optional): the kwargs to use while model generating
- '''
+ """
def __init__(
self,
@@ -92,21 +78,24 @@ def __init__(
self.actor_optim = Adam(self.actor.parameters(), lr=1e-7)
self.critic_optim = Adam(self.critic.parameters(), lr=1e-7)
- (self.actor, self.actor_optim), (self.critic, self.critic_optim) = \
- self.strategy.prepare((self.actor, self.actor_optim), (self.critic, self.critic_optim))
+ (self.actor, self.actor_optim), (self.critic, self.critic_optim) = self.strategy.prepare(
+ (self.actor, self.actor_optim), (self.critic, self.critic_optim)
+ )
# configure trainer
self.actor_loss_fn = PolicyLoss(eps_clip)
self.critic_loss_fn = ValueLoss(value_clip)
- super().__init__(experience_maker_holder_name_list,
- train_batch_size=train_batch_size,
- buffer_limit=buffer_limit,
- dataloader_pin_memory=dataloader_pin_memory,
- callbacks=callbacks,
- debug=debug)
+ super().__init__(
+ experience_maker_holder_name_list,
+ train_batch_size=train_batch_size,
+ buffer_limit=buffer_limit,
+ dataloader_pin_memory=dataloader_pin_memory,
+ callbacks=callbacks,
+ debug=debug,
+ )
if self._debug:
- print(f'[trainer{get_rank()}] will send state dict to {experience_maker_holder_name_list}')
+ print(f"[trainer{get_rank()}] will send state dict to {experience_maker_holder_name_list}")
self._update_lora_weights = update_lora_weights
@@ -115,7 +104,7 @@ def __init__(
def _update_remote_makers(self, fully_update: bool = False, **config):
# TODO: balance duties
if not fully_update:
- config['requires_grad_only'] = True
+ config["requires_grad_only"] = True
self.update_target_holder_list()
# mark start, ensure order
tasks = []
@@ -131,7 +120,9 @@ def _update_remote_makers(self, fully_update: bool = False, **config):
target_holder.update_experience_maker.remote(
new_actor_state_dict=state_dict_shard,
new_actor_lora_config_dict=self._get_model_lora_config_dict(self.actor),
- fully_update=fully_update))
+ fully_update=fully_update,
+ )
+ )
# sending loop
for state_dict_shard in self._get_model_state_dict_shard(self.critic, fully_update=fully_update, **config):
for target_holder in self.target_holder_list:
@@ -139,7 +130,9 @@ def _update_remote_makers(self, fully_update: bool = False, **config):
target_holder.update_experience_maker.remote(
new_critic_state_dict=state_dict_shard,
new_critic_lora_config_dict=self._get_model_lora_config_dict(self.critic),
- fully_update=fully_update))
+ fully_update=fully_update,
+ )
+ )
ray.get(tasks)
# mark end
for target_holder in self.target_holder_list:
@@ -152,26 +145,24 @@ def training_step(self, experience: Experience) -> Dict[str, float]:
num_actions = experience.action_mask.size(1)
action_log_probs = self.actor(experience.sequences, num_actions, attention_mask=experience.attention_mask)
- actor_loss = self.actor_loss_fn(action_log_probs,
- experience.action_log_probs,
- experience.advantages,
- action_mask=experience.action_mask)
+ actor_loss = self.actor_loss_fn(
+ action_log_probs, experience.action_log_probs, experience.advantages, action_mask=experience.action_mask
+ )
self.strategy.backward(actor_loss, self.actor, self.actor_optim)
self.strategy.optimizer_step(self.actor_optim)
self.actor_optim.zero_grad()
- values = self.critic(experience.sequences,
- action_mask=experience.action_mask,
- attention_mask=experience.attention_mask)
- critic_loss = self.critic_loss_fn(values,
- experience.values,
- experience.reward,
- action_mask=experience.action_mask)
+ values = self.critic(
+ experience.sequences, action_mask=experience.action_mask, attention_mask=experience.attention_mask
+ )
+ critic_loss = self.critic_loss_fn(
+ values, experience.values, experience.reward, action_mask=experience.action_mask
+ )
self.strategy.backward(critic_loss, self.critic, self.critic_optim)
self.strategy.optimizer_step(self.critic_optim)
self.critic_optim.zero_grad()
- return {'actor_loss': actor_loss.item(), 'critic_loss': critic_loss.item()}
+ return {"actor_loss": actor_loss.item(), "critic_loss": critic_loss.item()}
def strategy_save_actor(self, path: str, only_rank0: bool = False) -> None:
self.strategy.save_model(self.actor, path, only_rank0)
diff --git a/applications/Chat/coati/ray/experience_maker_holder.py b/applications/Chat/coati/ray/experience_maker_holder.py
index 13314bdafd5f..4d290f4aba88 100644
--- a/applications/Chat/coati/ray/experience_maker_holder.py
+++ b/applications/Chat/coati/ray/experience_maker_holder.py
@@ -1,53 +1,49 @@
import os
import time
import tracemalloc
-from copy import deepcopy
from threading import Lock
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
import ray
import torch
-import torch.nn as nn
-from coati.experience_buffer.utils import BufferItem, make_experience_batch, split_experience_batch
-from coati.experience_maker import Experience, ExperienceMaker, NaiveExperienceMaker
+from coati.experience_buffer.utils import split_experience_batch
+from coati.experience_maker import Experience, NaiveExperienceMaker
from coati.models.base import Actor, Critic, RewardModel
-from coati.trainer.callbacks import Callback
from coati.trainer.strategies import Strategy
-from coati.trainer.strategies.sampler import DistributedSampler
-from ray.exceptions import GetTimeoutError
from torch import Tensor
from tqdm import tqdm
from .callbacks import ExperienceMakerPerformanceEvaluator, MakerCallback
from .lora_constructor import LoRAConstructor
-from .utils import get_model_numel, get_rank, get_world_size, is_rank_0, set_dist_env, state_dict_to
+from .utils import get_model_numel, get_rank, is_rank_0, set_dist_env, state_dict_to
@ray.remote(concurrency_groups={"experience_io": 1, "model_io": 1, "compute": 1})
class ExperienceMakerHolder:
- '''
+ """
Args:
detached_trainer_name_list: str list to get ray actor handles
strategy:
kl_coef: the coefficient of kl divergence loss
sync_models_from_trainers: whether to sync models from trainers. If True, you must call sync_models_to_remote_makers() in trainers to sync models.
- '''
+ """
def __init__(
- self,
- detached_trainer_name_list: List[str],
- strategy_fn: Callable[[], Strategy],
+ self,
+ detached_trainer_name_list: List[str],
+ strategy_fn: Callable[[], Strategy],
# a function returns (actor, critic, reward_model, initial_model)
- model_fn: Callable[[], Tuple[Actor, Critic, RewardModel, Actor]],
- env_info: Dict[str, str] = None,
- sync_models_from_trainers: bool = False,
- buffer_cpu_offload: bool = True,
- kl_coef: float = 0.1,
- callbacks: List[MakerCallback] = [],
- eval_performance: bool = False,
- debug: bool = False,
- update_lora_weights: bool = False,
- **generate_kwargs):
+ model_fn: Callable[[], Tuple[Actor, Critic, RewardModel, Actor]],
+ env_info: Dict[str, str] = None,
+ sync_models_from_trainers: bool = False,
+ buffer_cpu_offload: bool = True,
+ kl_coef: float = 0.1,
+ callbacks: List[MakerCallback] = [],
+ eval_performance: bool = False,
+ debug: bool = False,
+ update_lora_weights: bool = False,
+ **generate_kwargs,
+ ):
# set environment variables
if env_info:
set_dist_env(env_info=env_info)
@@ -66,8 +62,9 @@ def __init__(
critic_numel = get_model_numel(critic)
initial_model_numel = get_model_numel(initial_model)
reward_model_numel = get_model_numel(reward_model)
- evaluator = ExperienceMakerPerformanceEvaluator(actor_numel, critic_numel, initial_model_numel,
- reward_model_numel)
+ evaluator = ExperienceMakerPerformanceEvaluator(
+ actor_numel, critic_numel, initial_model_numel, reward_model_numel
+ )
callbacks = callbacks + [evaluator]
actor, critic, reward_model, initial_model = self.strategy.prepare(actor, critic, reward_model, initial_model)
@@ -89,9 +86,9 @@ def __init__(
self._target_idx = 0
if self._debug:
- print(f'[maker{get_rank()}] will send items to {self._detached_trainer_name_list}')
+ print(f"[maker{get_rank()}] will send items to {self._detached_trainer_name_list}")
if not self._is_fully_initialized:
- print(f'[maker{get_rank()}] Waiting for INIT')
+ print(f"[maker{get_rank()}] Waiting for INIT")
def _get_ready(self):
while not self._fully_initialized():
@@ -136,7 +133,7 @@ def _inference_step(self, batch) -> None:
self._on_make_experience_end(experience)
self._on_send_start()
if self.buffer_cpu_offload:
- experience.to_device('cpu')
+ experience.to_device("cpu")
self._send_items(experience)
self._on_send_end()
self._on_batch_end()
@@ -155,7 +152,7 @@ def workingloop(self, dataloader_fn: Callable[[], Iterable], num_epochs: int = 1
if num_steps > 0:
# ignore num epochs
it = iter(dataloader)
- for _ in tqdm(range(num_steps), desc='ExperienceMaker', disable=not is_rank_0()):
+ for _ in tqdm(range(num_steps), desc="ExperienceMaker", disable=not is_rank_0()):
try:
batch = next(it)
except StopIteration:
@@ -163,7 +160,7 @@ def workingloop(self, dataloader_fn: Callable[[], Iterable], num_epochs: int = 1
batch = next(it)
self._inference_step(batch)
else:
- with tqdm(total=num_epochs * len(dataloader), desc='ExperienceMaker', disable=not is_rank_0()) as pbar:
+ with tqdm(total=num_epochs * len(dataloader), desc="ExperienceMaker", disable=not is_rank_0()) as pbar:
for _ in range(num_epochs):
for batch in dataloader:
self._inference_step(batch)
@@ -171,22 +168,24 @@ def workingloop(self, dataloader_fn: Callable[[], Iterable], num_epochs: int = 1
self._on_loop_end()
@ray.method(concurrency_group="model_io")
- def update_experience_maker(self,
- new_actor_state_dict: Dict[str, Any] = None,
- new_actor_lora_config_dict: Dict[str, Any] = None,
- new_critic_state_dict: Dict[str, Any] = None,
- new_critic_lora_config_dict: Dict[str, Any] = None,
- fully_update: bool = False,
- chunk_start: bool = None,
- chunk_end: bool = None):
- '''
- called by trainer
- chunk_start: Set True at the first call. Before sending state_dict calls
- chunk_end: Set True at the last call. After sending state_dict calls.
- fully_update: Set True if you want to sync models when initializing
-
- TODO: load_state_dict integrate with model-sharding strategy
- '''
+ def update_experience_maker(
+ self,
+ new_actor_state_dict: Dict[str, Any] = None,
+ new_actor_lora_config_dict: Dict[str, Any] = None,
+ new_critic_state_dict: Dict[str, Any] = None,
+ new_critic_lora_config_dict: Dict[str, Any] = None,
+ fully_update: bool = False,
+ chunk_start: bool = None,
+ chunk_end: bool = None,
+ ):
+ """
+ called by trainer
+ chunk_start: Set True at the first call. Before sending state_dict calls
+ chunk_end: Set True at the last call. After sending state_dict calls.
+ fully_update: Set True if you want to sync models when initializing
+
+ TODO: load_state_dict integrate with model-sharding strategy
+ """
_watch_memory = self._debug
if chunk_start:
if self._debug:
@@ -202,18 +201,22 @@ def update_experience_maker(self,
else:
new_actor_state_dict = state_dict_to(new_actor_state_dict, device=torch.cuda.current_device())
state_dict_increase = self.actor_lora_constructor.reconstruct_increase(
- new_actor_state_dict, new_actor_lora_config_dict)
+ new_actor_state_dict, new_actor_lora_config_dict
+ )
self.actor_lora_constructor.load_state_dict_increase(
- self.experience_maker.actor.model, state_dict_increase)
+ self.experience_maker.actor.model, state_dict_increase
+ )
if new_critic_state_dict is not None:
if not self._update_lora_weights or fully_update:
self.experience_maker.critic.load_state_dict(new_critic_state_dict, strict=False)
else:
new_critic_state_dict = state_dict_to(new_critic_state_dict, device=torch.cuda.current_device())
state_dict_increase = self.critic_lora_constructor.reconstruct_increase(
- new_critic_state_dict, new_critic_lora_config_dict)
+ new_critic_state_dict, new_critic_lora_config_dict
+ )
self.critic_lora_constructor.load_state_dict_increase(
- self.experience_maker.critic, state_dict_increase)
+ self.experience_maker.critic, state_dict_increase
+ )
# the lock must be released after both actor and critic being updated
if chunk_end:
@@ -262,10 +265,10 @@ def _set_default_generate_kwargs(generate_kwargs: dict, actor: Actor) -> None:
origin_model = actor.model
new_kwargs = {**generate_kwargs}
# use huggingface models method directly
- if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'):
- new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation
+ if "prepare_inputs_fn" not in generate_kwargs and hasattr(origin_model, "prepare_inputs_for_generation"):
+ new_kwargs["prepare_inputs_fn"] = origin_model.prepare_inputs_for_generation
- if 'update_model_kwargs_fn' not in generate_kwargs and hasattr(origin_model, '_update_model_kwargs_for_generation'):
- new_kwargs['update_model_kwargs_fn'] = origin_model._update_model_kwargs_for_generation
+ if "update_model_kwargs_fn" not in generate_kwargs and hasattr(origin_model, "_update_model_kwargs_for_generation"):
+ new_kwargs["update_model_kwargs_fn"] = origin_model._update_model_kwargs_for_generation
return new_kwargs
diff --git a/applications/Chat/coati/ray/lora_constructor.py b/applications/Chat/coati/ray/lora_constructor.py
index a98545d4d751..8e9f78700e29 100644
--- a/applications/Chat/coati/ray/lora_constructor.py
+++ b/applications/Chat/coati/ray/lora_constructor.py
@@ -1,11 +1,9 @@
from collections import OrderedDict
from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Dict
-import torch
import torch.nn as nn
from coati.models.lora import LoraLinear
-from loralib.layers import LoRALayer
@dataclass
@@ -17,7 +15,7 @@ class LoRAConfig:
class LoRAConstructor:
- '''
+ """
Tools for reconstructing a model from a remote LoRA model.
(Transferring only LoRA data costs much less!)
Usage:
@@ -36,7 +34,7 @@ class LoRAConstructor:
Step 5 (Receiver):
load_state_dict_increase()
- '''
+ """
def __init__(self):
self.lora_config_dict = None
@@ -45,10 +43,10 @@ def register_lora_config(self, lora_config_dict: Dict[str, Any]):
self.lora_config_dict = lora_config_dict
def reconstruct_increase(self, state_dict_lora: Dict[str, Any], lora_config_dict: Dict[str, Any]):
- '''
- xxx.lora_A, xxx.lora_B -->> xxx.weight
- Warning: the xxx.weight here is the increment actually.
- '''
+ """
+ xxx.lora_A, xxx.lora_B -->> xxx.weight
+ Warning: the xxx.weight here is the increment actually.
+ """
if lora_config_dict is not None:
self.register_lora_config(lora_config_dict)
@@ -56,24 +54,25 @@ def reconstruct_increase(self, state_dict_lora: Dict[str, Any], lora_config_dict
config_iter = iter(self.lora_config_dict.items())
lora_A, lora_B, layer_prefix = None, None, None
for k, v in state_dict_lora.items():
- if k.rpartition('.')[-1] == 'lora_A':
+ if k.rpartition(".")[-1] == "lora_A":
lora_A = v
- layer_prefix = k.rpartition('.')[0]
- elif k.rpartition('.')[-1] == 'lora_B':
- assert layer_prefix == k.rpartition('.')[0], "unmatched (lora_A, lora_B) pair"
+ layer_prefix = k.rpartition(".")[0]
+ elif k.rpartition(".")[-1] == "lora_B":
+ assert layer_prefix == k.rpartition(".")[0], "unmatched (lora_A, lora_B) pair"
layer_prefix_2, config = next(config_iter)
assert layer_prefix_2 == layer_prefix, "unmatched (state_dict, config_dict) pair"
lora_B = v
weight_data_increase = self._compute(lora_A, lora_B, config)
- state_dict_increase[layer_prefix + '.weight'] = weight_data_increase
+ state_dict_increase[layer_prefix + ".weight"] = weight_data_increase
lora_A, lora_B, layer_prefix = None, None, None
else:
- raise ValueError('unexpected key')
+ raise ValueError("unexpected key")
return state_dict_increase
def _compute(self, lora_A, lora_B, config=LoRAConfig()):
def T(w):
return w.T if config.fan_in_fan_out else w
+
if config.r > 0:
scaling = config.lora_alpha / config.r
weight_data_increase = T(lora_B @ lora_A) * scaling
@@ -81,21 +80,21 @@ def T(w):
return 0
def load_state_dict_increase(self, model: nn.Module, state_dict_increase: Dict[str, Any]):
- '''
+ """
The final reconstruction step
- '''
+ """
# naive approach
model.load_state_dict({k: v + model.state_dict()[k] for k, v in state_dict_increase.items()}, strict=False)
@staticmethod
def filter_state_dict_lora(state_dict: Dict[str, Any], keep_non_lora=False):
- '''
+ """
if keep_non_lora, also return non_lora state_dict
- '''
+ """
state_dict_lora = OrderedDict()
state_dict_non_lora = OrderedDict()
for k, v in state_dict.items():
- if 'lora_A' in k or 'lora_B' in k:
+ if "lora_A" in k or "lora_B" in k:
state_dict_lora[k] = v
elif keep_non_lora:
state_dict_non_lora[k] = v
@@ -106,17 +105,19 @@ def filter_state_dict_lora(state_dict: Dict[str, Any], keep_non_lora=False):
@staticmethod
def extract_lora_config(model: nn.Module) -> Dict[str, LoRAConfig]:
- '''
+ """
extract LoraLinear model.
return OrderedDict(): name -> LoRAConfig
- '''
+ """
lora_config_dict = OrderedDict()
for name, child in model.named_modules():
if isinstance(child, LoraLinear):
- lora_config_dict[name] = LoRAConfig(r=child.r,
- lora_alpha=child.lora_alpha,
- lora_dropout=child.lora_dropout,
- fan_in_fan_out=child.fan_in_fan_out)
+ lora_config_dict[name] = LoRAConfig(
+ r=child.r,
+ lora_alpha=child.lora_alpha,
+ lora_dropout=child.lora_dropout,
+ fan_in_fan_out=child.fan_in_fan_out,
+ )
return lora_config_dict
diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py
index 391ffe7a91a9..b88140c0e036 100644
--- a/applications/Chat/coati/ray/utils.py
+++ b/applications/Chat/coati/ray/utils.py
@@ -1,6 +1,6 @@
import os
from collections import OrderedDict
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Dict
import torch
import torch.distributed as dist
@@ -10,7 +10,7 @@
from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
from coati.models.opt import OPTRM, OPTActor, OPTCritic
from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
-from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer
+from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer
def is_rank_0() -> bool:
@@ -26,13 +26,13 @@ def get_world_size() -> int:
def get_actor_from_args(model: str, pretrained: str = None, config=None, lora_rank=0):
- if model == 'gpt2':
+ if model == "gpt2":
actor = GPTActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
- elif model == 'bloom':
+ elif model == "bloom":
actor = BLOOMActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
- elif model == 'opt':
+ elif model == "opt":
actor = OPTActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
- elif model == 'llama':
+ elif model == "llama":
actor = LlamaActor(pretrained=pretrained, config=config, lora_rank=lora_rank)
else:
raise ValueError(f'Unsupported actor model "{model}"')
@@ -40,27 +40,27 @@ def get_actor_from_args(model: str, pretrained: str = None, config=None, lora_ra
def get_critic_from_args(model: str, pretrained: str = None, config=None, lora_rank=0):
- if model == 'gpt2':
- critic = GPTCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
- elif model == 'bloom':
- critic = BLOOMCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
- elif model == 'opt':
- critic = OPTCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
- elif model == 'llama':
- critic = LlamaCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True)
+ if model == "gpt2":
+ critic = GPTCritic(pretrained=pretrained, lora_rank=lora_rank, config=config)
+ elif model == "bloom":
+ critic = BLOOMCritic(pretrained=pretrained, lora_rank=lora_rank, config=config)
+ elif model == "opt":
+ critic = OPTCritic(pretrained=pretrained, lora_rank=lora_rank, config=config)
+ elif model == "llama":
+ critic = LlamaCritic(pretrained=pretrained, lora_rank=lora_rank, config=config)
else:
raise ValueError(f'Unsupported reward model "{model}"')
return critic
def get_reward_model_from_args(model: str, pretrained: str = None, config=None):
- if model == 'gpt2':
+ if model == "gpt2":
reward_model = GPTRM(pretrained=pretrained, config=config)
- elif model == 'bloom':
+ elif model == "bloom":
reward_model = BLOOMRM(pretrained=pretrained, config=config)
- elif model == 'opt':
+ elif model == "opt":
reward_model = OPTRM(pretrained=pretrained, config=config)
- elif model == 'llama':
+ elif model == "llama":
reward_model = LlamaRM(pretrained=pretrained, config=config)
else:
raise ValueError(f'Unsupported reward model "{model}"')
@@ -68,29 +68,29 @@ def get_reward_model_from_args(model: str, pretrained: str = None, config=None):
def get_strategy_from_args(strategy: str):
- if strategy == 'ddp':
+ if strategy == "ddp":
strategy_ = DDPStrategy()
- elif strategy == 'colossalai_gemini':
- strategy_ = GeminiStrategy(placement_policy='cuda', initial_scale=2**5)
- elif strategy == 'colossalai_zero2':
- strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
- elif strategy == 'colossalai_gemini_cpu':
- strategy_ = GeminiStrategy(placement_policy='cpu', initial_scale=2**5)
- elif strategy == 'colossalai_zero2_cpu':
- strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
+ elif strategy == "colossalai_gemini":
+ strategy_ = GeminiStrategy(placement_policy="static", initial_scale=2**5)
+ elif strategy == "colossalai_zero2":
+ strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
+ elif strategy == "colossalai_gemini_cpu":
+ strategy_ = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
+ elif strategy == "colossalai_zero2_cpu":
+ strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
else:
raise ValueError(f'Unsupported strategy "{strategy}"')
return strategy_
def get_tokenizer_from_args(model: str, **kwargs):
- if model == 'gpt2':
- tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
- elif model == 'bloom':
- tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m')
- elif model == 'opt':
+ if model == "gpt2":
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+ elif model == "bloom":
+ tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
+ elif model == "opt":
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
- elif model == 'llama':
+ elif model == "llama":
pretrain_path = kwargs["pretrain"]
tokenizer = AutoTokenizer.from_pretrained(pretrain_path)
else:
@@ -101,11 +101,11 @@ def get_tokenizer_from_args(model: str, **kwargs):
def set_dist_env(env_info: Dict[str, str]):
- os.environ["RANK"] = env_info['rank']
- os.environ["LOCAL_RANK"] = env_info['local_rank']
- os.environ["WORLD_SIZE"] = env_info['world_size']
- os.environ['MASTER_PORT'] = env_info['master_port']
- os.environ['MASTER_ADDR'] = env_info['master_addr']
+ os.environ["RANK"] = env_info["rank"]
+ os.environ["LOCAL_RANK"] = env_info["local_rank"]
+ os.environ["WORLD_SIZE"] = env_info["world_size"]
+ os.environ["MASTER_PORT"] = env_info["master_port"]
+ os.environ["MASTER_ADDR"] = env_info["master_addr"]
def get_model_numel(model: nn.Module) -> int:
@@ -128,12 +128,12 @@ def get_receivers_per_sender(sender_idx: int, num_senders: int, num_receivers: i
return target_receivers
-def state_dict_to(state_dict: Dict[str, Any],
- dtype: torch.dtype = torch.float16,
- device: torch.device = torch.device('cpu')):
- '''
- keep state_dict intact
- '''
+def state_dict_to(
+ state_dict: Dict[str, Any], dtype: torch.dtype = torch.float16, device: torch.device = torch.device("cpu")
+):
+ """
+ keep state_dict intact
+ """
new_state_dict = OrderedDict()
for k, v in state_dict.items():
new_state_dict[k] = v.to(dtype=dtype, device=device)
diff --git a/applications/Chat/coati/trainer/__init__.py b/applications/Chat/coati/trainer/__init__.py
index 86142361f3ff..4be5d27f93b1 100644
--- a/applications/Chat/coati/trainer/__init__.py
+++ b/applications/Chat/coati/trainer/__init__.py
@@ -3,8 +3,4 @@
from .rm import RewardModelTrainer
from .sft import SFTTrainer
-__all__ = [
- 'SLTrainer', 'OnPolicyTrainer',
- 'RewardModelTrainer', 'SFTTrainer',
- 'PPOTrainer'
-]
+__all__ = ["SLTrainer", "OnPolicyTrainer", "RewardModelTrainer", "SFTTrainer", "PPOTrainer"]
diff --git a/applications/Chat/coati/trainer/base.py b/applications/Chat/coati/trainer/base.py
index 0629c9c00cca..0a41d450d41e 100644
--- a/applications/Chat/coati/trainer/base.py
+++ b/applications/Chat/coati/trainer/base.py
@@ -7,11 +7,10 @@
from coati.experience_buffer import NaiveExperienceBuffer
from coati.experience_maker import Experience
from torch.optim import Optimizer
-from torch.utils.data import DataLoader
from .callbacks import Callback
from .strategies import Strategy
-from .utils import CycledDataLoader, is_rank_0
+from .utils import is_rank_0
class SLTrainer(ABC):
@@ -47,11 +46,11 @@ def _eval(self, epoch):
raise NotImplementedError()
def _before_fit(self):
- self.no_epoch_bar = False
+ raise NotImplementedError()
def fit(self, *args, **kwargs):
self._before_fit(*args, **kwargs)
- for epoch in tqdm.trange(self.max_epochs, desc="Epochs", disable=not is_rank_0() or self.no_epoch_bar):
+ for epoch in tqdm.trange(self.max_epochs, desc="Epochs", disable=not is_rank_0()):
self._train(epoch)
self._eval(epoch)
@@ -68,12 +67,14 @@ class OnPolicyTrainer(ABC):
callbacks (List[Callback], defaults to []): the callbacks to call during training process
"""
- def __init__(self,
- strategy: Strategy,
- data_buffer: NaiveExperienceBuffer,
- sample_buffer: bool,
- dataloader_pin_memory: bool,
- callbacks: List[Callback] = []) -> None:
+ def __init__(
+ self,
+ strategy: Strategy,
+ data_buffer: NaiveExperienceBuffer,
+ sample_buffer: bool,
+ dataloader_pin_memory: bool,
+ callbacks: List[Callback] = [],
+ ) -> None:
super().__init__()
self.strategy = strategy
self.data_buffer = data_buffer
@@ -121,9 +122,9 @@ def _on_learn_batch_start(self) -> None:
for callback in self.callbacks:
callback.on_learn_batch_start()
- def _on_learn_batch_end(self, metrics: dict, experience: Experience) -> None:
+ def _on_learn_batch_end(self, experience: Experience) -> None:
for callback in self.callbacks:
- callback.on_learn_batch_end(metrics, experience)
+ callback.on_learn_batch_end(experience)
@abstractmethod
def _make_experience(self, collect_step: int):
@@ -151,27 +152,26 @@ def _update_phase(self, update_step: int):
self._learn(update_step)
self._on_learn_epoch_end(update_step)
+ def _before_fit(self, *args, **kwargs):
+ raise NotImplementedError()
+
def fit(
self,
- prompt_dataloader: DataLoader,
- pretrain_dataloader: DataLoader,
num_episodes: int,
num_collect_steps: int,
num_update_steps: int,
+ *args,
+ **kwargs,
):
"""
The main training loop of on-policy rl trainers.
Args:
- prompt_dataloader (DataLoader): the dataloader to use for prompt data
- pretrain_dataloader (DataLoader): the dataloader to use for pretrain data
num_episodes (int): the number of episodes to train
num_collect_steps (int): the number of collect steps per episode
num_update_steps (int): the number of update steps per episode
"""
- self.prompt_dataloader = CycledDataLoader(prompt_dataloader)
- self.pretrain_dataloader = CycledDataLoader(pretrain_dataloader)
-
+ self._before_fit(*args, **kwargs)
with self._fit_ctx():
for episode in tqdm.trange(num_episodes, desc="Episodes", disable=not is_rank_0()):
with self._episode_ctx(episode):
diff --git a/applications/Chat/coati/trainer/callbacks/__init__.py b/applications/Chat/coati/trainer/callbacks/__init__.py
index 9ed0ee6f7640..29c8c4f00a5c 100644
--- a/applications/Chat/coati/trainer/callbacks/__init__.py
+++ b/applications/Chat/coati/trainer/callbacks/__init__.py
@@ -2,4 +2,4 @@
from .performance_evaluator import PerformanceEvaluator
from .save_checkpoint import SaveCheckpoint
-__all__ = ['Callback', 'PerformanceEvaluator', 'SaveCheckpoint']
+__all__ = ["Callback", "PerformanceEvaluator", "SaveCheckpoint"]
diff --git a/applications/Chat/coati/trainer/callbacks/base.py b/applications/Chat/coati/trainer/callbacks/base.py
index f5616048855b..c6e30f04885c 100644
--- a/applications/Chat/coati/trainer/callbacks/base.py
+++ b/applications/Chat/coati/trainer/callbacks/base.py
@@ -5,7 +5,7 @@
class Callback(ABC):
"""
- Base callback class. It defines the interface for callbacks.
+ Base callback class. It defines the interface for callbacks.
"""
def on_fit_start(self) -> None:
@@ -35,5 +35,5 @@ def on_learn_epoch_end(self, epoch: int) -> None:
def on_learn_batch_start(self) -> None:
pass
- def on_learn_batch_end(self, metrics: dict, experience: Experience) -> None:
+ def on_learn_batch_end(self, experience: Experience) -> None:
pass
diff --git a/applications/Chat/coati/trainer/callbacks/performance_evaluator.py b/applications/Chat/coati/trainer/callbacks/performance_evaluator.py
index 9b44dafa7eaa..b286c766c263 100644
--- a/applications/Chat/coati/trainer/callbacks/performance_evaluator.py
+++ b/applications/Chat/coati/trainer/callbacks/performance_evaluator.py
@@ -21,9 +21,9 @@ def print_rank_0(*args, **kwargs) -> None:
def divide(x: float, y: float) -> float:
if y == 0:
- return float('inf')
- elif y == float('inf'):
- return float('nan')
+ return float("inf")
+ elif y == float("inf"):
+ return float("nan")
return x / y
@@ -38,10 +38,9 @@ def all_reduce_mean(x: float, world_size: int) -> float:
class Timer:
-
def __init__(self) -> None:
self.start_time: Optional[float] = None
- self.duration: float = 0.
+ self.duration: float = 0.0
def start(self) -> None:
self.start_time = time()
@@ -52,7 +51,7 @@ def end(self) -> None:
self.start_time = None
def reset(self) -> None:
- self.duration = 0.
+ self.duration = 0.0
class PerformanceEvaluator(Callback):
@@ -67,13 +66,15 @@ class PerformanceEvaluator(Callback):
ignore_episodes: The number of episodes to ignore when calculating the performance.
"""
- def __init__(self,
- actor_num_params: int,
- critic_num_params: int,
- initial_model_num_params: int,
- reward_model_num_params: int,
- enable_grad_checkpoint: bool = False,
- ignore_episodes: int = 0) -> None:
+ def __init__(
+ self,
+ actor_num_params: int,
+ critic_num_params: int,
+ initial_model_num_params: int,
+ reward_model_num_params: int,
+ enable_grad_checkpoint: bool = False,
+ ignore_episodes: int = 0,
+ ) -> None:
super().__init__()
self.world_size = get_world_size()
self.actor_num_params = actor_num_params
@@ -136,7 +137,7 @@ def on_learn_batch_start(self) -> None:
return
self.learn_timer.start()
- def on_learn_batch_end(self, metrics: dict, experience: Experience) -> None:
+ def on_learn_batch_end(self, experience: Experience) -> None:
if self.disable:
return
self.learn_timer.end()
@@ -155,8 +156,9 @@ def on_fit_end(self) -> None:
avg_learn_duration = all_reduce_mean(self.learn_timer.duration, self.world_size)
avg_overall_duration = all_reduce_mean(self.overall_timer.duration, self.world_size)
- avg_make_experience_throughput = self.make_experience_num_samples * \
- self.world_size / (avg_make_experience_duration + 1e-12)
+ avg_make_experience_throughput = (
+ self.make_experience_num_samples * self.world_size / (avg_make_experience_duration + 1e-12)
+ )
avg_make_experience_tflops = self.make_experience_flop / 1e12 / (avg_make_experience_duration + 1e-12)
avg_learn_throughput = self.learn_num_samples * self.world_size / (avg_learn_duration + 1e-12)
@@ -171,13 +173,11 @@ def on_fit_end(self) -> None:
learn_time_per_sample = divide(avg_learn_duration, num_effective_samples)
print_rank_0(
- f'Performance summary:\n'
- + f'Generate {self.make_experience_num_samples * self.world_size} samples, throughput: {avg_make_experience_throughput:.2f} samples/s, TFLOPS per GPU: {avg_make_experience_tflops:.2f}\n'
-
- + f'Train {self.learn_num_samples * self.world_size} samples, throughput: {avg_learn_throughput:.2f} samples/s, TFLOPS per GPU: {avg_learn_tflops:.2f}\n'
- + f'Overall throughput: {avg_overall_throughput:.2f} samples/s\n'
- + f'Overall time per sample: {overall_time_per_sample:.2f} s\n'
- + f'Make experience time per sample: {make_experience_time_per_sample:.2f} s, {make_experience_time_per_sample/overall_time_per_sample*100:.2f}%\n'
-
- + f'Learn time per sample: {learn_time_per_sample:.2f} s, {learn_time_per_sample/overall_time_per_sample*100:.2f}%'
+ f"Performance summary:\n"
+ + f"Generate {self.make_experience_num_samples * self.world_size} samples, throughput: {avg_make_experience_throughput:.2f} samples/s, TFLOPS per GPU: {avg_make_experience_tflops:.2f}\n"
+ + f"Train {self.learn_num_samples * self.world_size} samples, throughput: {avg_learn_throughput:.2f} samples/s, TFLOPS per GPU: {avg_learn_tflops:.2f}\n"
+ + f"Overall throughput: {avg_overall_throughput:.2f} samples/s\n"
+ + f"Overall time per sample: {overall_time_per_sample:.2f} s\n"
+ + f"Make experience time per sample: {make_experience_time_per_sample:.2f} s, {make_experience_time_per_sample/overall_time_per_sample*100:.2f}%\n"
+ + f"Learn time per sample: {learn_time_per_sample:.2f} s, {learn_time_per_sample/overall_time_per_sample*100:.2f}%"
)
diff --git a/applications/Chat/coati/trainer/callbacks/save_checkpoint.py b/applications/Chat/coati/trainer/callbacks/save_checkpoint.py
index f0d77a191a88..0d70b6c53073 100644
--- a/applications/Chat/coati/trainer/callbacks/save_checkpoint.py
+++ b/applications/Chat/coati/trainer/callbacks/save_checkpoint.py
@@ -36,34 +36,35 @@ class SaveCheckpoint(Callback):
"""
- def __init__(self,
- path: str,
- interval: int,
- strategy: Strategy,
- actor: nn.Module = None,
- critic: nn.Module = None,
- actor_optim: Optimizer = None,
- critic_optim: Optimizer = None) -> None:
+ def __init__(
+ self,
+ path: str,
+ interval: int,
+ strategy: Strategy,
+ actor: nn.Module = None,
+ critic: nn.Module = None,
+ actor_optim: Optimizer = None,
+ critic_optim: Optimizer = None,
+ ) -> None:
super().__init__()
- self.path = os.path.join(path, 'checkpoint')
+ self.path = os.path.join(path, "checkpoint")
self.interval = interval
self.strategy = strategy
- self.model_dict = {'actor': [actor, actor_optim], 'critic': [critic, critic_optim]}
+ self.model_dict = {"actor": [actor, actor_optim], "critic": [critic, critic_optim]}
def on_episode_end(self, episode: int) -> None:
if (episode + 1) % self.interval != 0:
return
- base_path = os.path.join(self.path, f'episode_{episode}')
+ base_path = os.path.join(self.path, f"episode_{episode}")
if not os.path.exists(base_path):
os.makedirs(base_path)
for model in self.model_dict.keys():
-
# save model
if self.model_dict[model][0] is None:
# saving only optimizer states is meaningless, so it would be skipped
continue
- model_path = os.path.join(base_path, f'{model}.pt')
+ model_path = os.path.join(base_path, f"{model}.pt")
self.strategy.save_model(model=self.model_dict[model][0], path=model_path, only_rank0=True)
# save optimizer
@@ -71,5 +72,5 @@ def on_episode_end(self, episode: int) -> None:
continue
only_rank0 = not isinstance(self.strategy, (LowLevelZeroStrategy, GeminiStrategy))
rank = 0 if is_rank_0() else dist.get_rank()
- optim_path = os.path.join(base_path, f'{model}-optim-rank-{rank}.pt')
+ optim_path = os.path.join(base_path, f"{model}-optim-rank-{rank}.pt")
self.strategy.save_optimizer(optimizer=self.model_dict[model][1], path=optim_path, only_rank0=only_rank0)
diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/Chat/coati/trainer/ppo.py
index ef625a1c1b3d..d6966689885e 100644
--- a/applications/Chat/coati/trainer/ppo.py
+++ b/applications/Chat/coati/trainer/ppo.py
@@ -1,34 +1,33 @@
-from typing import Dict, List
+from typing import Dict, List, Optional
-import torch.nn as nn
from coati.experience_buffer import NaiveExperienceBuffer
from coati.experience_maker import Experience, NaiveExperienceMaker
-from coati.models.base import Actor, Critic, get_base_model
+from coati.models.base import Actor, Critic, RewardModel, get_base_model
from coati.models.loss import GPTLMLoss, PolicyLoss, ValueLoss
from coati.models.utils import calc_action_log_probs
-from torch import Tensor
from torch.optim import Optimizer
from torch.utils.data import DataLoader, DistributedSampler
from tqdm import tqdm
+from transformers import PreTrainedTokenizerBase
from colossalai.utils import get_current_device
from .base import OnPolicyTrainer
from .callbacks import Callback
from .strategies import GeminiStrategy, Strategy
-from .utils import is_rank_0, to_device
+from .utils import CycledDataLoader, is_rank_0, to_device
def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> Dict:
- unwrapper_model = strategy.unwrap_model(actor)
- hf_model = get_base_model(unwrapper_model)
+ unwrapped_model = strategy.unwrap_model(actor)
+ hf_model = get_base_model(unwrapped_model)
new_kwargs = {**generate_kwargs}
# use huggingface models method directly
- if 'prepare_inputs_fn' not in generate_kwargs and hasattr(hf_model, 'prepare_inputs_for_generation'):
- new_kwargs['prepare_inputs_fn'] = hf_model.prepare_inputs_for_generation
+ if "prepare_inputs_fn" not in generate_kwargs and hasattr(hf_model, "prepare_inputs_for_generation"):
+ new_kwargs["prepare_inputs_fn"] = hf_model.prepare_inputs_for_generation
- if 'update_model_kwargs_fn' not in generate_kwargs and hasattr(hf_model, '_update_model_kwargs_for_generation'):
- new_kwargs['update_model_kwargs_fn'] = hf_model._update_model_kwargs_for_generation
+ if "update_model_kwargs_fn" not in generate_kwargs and hasattr(hf_model, "_update_model_kwargs_for_generation"):
+ new_kwargs["update_model_kwargs_fn"] = hf_model._update_model_kwargs_for_generation
return new_kwargs
@@ -41,7 +40,7 @@ class PPOTrainer(OnPolicyTrainer):
strategy (Strategy): the strategy to use for training
actor (Actor): the actor model in ppo algorithm
critic (Critic): the critic model in ppo algorithm
- reward_model (nn.Module): the reward model in rlhf algorithm to make reward of sentences
+ reward_model (RewardModel): the reward model in rlhf algorithm to make reward of sentences
initial_model (Actor): the initial model in rlhf algorithm to generate reference logics to limit the update of actor
actor_optim (Optimizer): the optimizer to use for actor model
critic_optim (Optimizer): the optimizer to use for critic model
@@ -60,45 +59,42 @@ class PPOTrainer(OnPolicyTrainer):
generate_kwargs (dict, optional): the kwargs to use while model generating
"""
- def __init__(self,
- strategy: Strategy,
- actor: Actor,
- critic: Critic,
- reward_model: nn.Module,
- initial_model: Actor,
- actor_optim: Optimizer,
- critic_optim: Optimizer,
- kl_coef: float = 0.1,
- ptx_coef: float = 0.9,
- train_batch_size: int = 8,
- buffer_limit: int = 0,
- buffer_cpu_offload: bool = True,
- eps_clip: float = 0.2,
- vf_coef: float = 1.0,
- value_clip: float = 0.4,
- sample_buffer: bool = False,
- dataloader_pin_memory: bool = True,
- offload_inference_models: bool = True,
- callbacks: List[Callback] = [],
- **generate_kwargs
- ) -> None:
+ def __init__(
+ self,
+ strategy: Strategy,
+ actor: Actor,
+ critic: Critic,
+ reward_model: RewardModel,
+ initial_model: Actor,
+ actor_optim: Optimizer,
+ critic_optim: Optimizer,
+ tokenizer: PreTrainedTokenizerBase,
+ kl_coef: float = 0.1,
+ ptx_coef: float = 0.9,
+ train_batch_size: int = 8,
+ buffer_limit: int = 0,
+ buffer_cpu_offload: bool = True,
+ eps_clip: float = 0.2,
+ vf_coef: float = 1.0,
+ value_clip: float = 0.4,
+ sample_buffer: bool = False,
+ dataloader_pin_memory: bool = True,
+ offload_inference_models: bool = True,
+ callbacks: List[Callback] = [],
+ **generate_kwargs,
+ ) -> None:
if isinstance(strategy, GeminiStrategy):
- assert not offload_inference_models, \
- "GeminiPlugin is not compatible with manual model.to('cpu')"
+ assert not offload_inference_models, "GeminiPlugin is not compatible with manual model.to('cpu')"
data_buffer = NaiveExperienceBuffer(train_batch_size, buffer_limit, buffer_cpu_offload)
- super().__init__(
- strategy, data_buffer,
- sample_buffer, dataloader_pin_memory,
- callbacks
- )
+ super().__init__(strategy, data_buffer, sample_buffer, dataloader_pin_memory, callbacks)
self.generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor)
- self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, kl_coef)
- self.offload_inference_models = offload_inference_models
+ self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, tokenizer, kl_coef)
self.actor = actor
self.critic = critic
+ self.tokenizer = tokenizer
self.actor_loss_fn = PolicyLoss(eps_clip)
self.critic_loss_fn = ValueLoss(value_clip)
@@ -108,84 +104,99 @@ def __init__(self,
self.actor_optim = actor_optim
self.critic_optim = critic_optim
+ self.offload_inference_models = offload_inference_models
self.device = get_current_device()
+ def _before_fit(
+ self,
+ prompt_dataloader: DataLoader,
+ pretrain_dataloader: DataLoader,
+ log_dir: Optional[str] = None,
+ use_wandb: bool = False,
+ ):
+ """
+ Args:
+ prompt_dataloader (DataLoader): the dataloader to use for prompt data
+ pretrain_dataloader (DataLoader): the dataloader to use for pretrain data
+ """
+ self.prompt_dataloader = CycledDataLoader(prompt_dataloader)
+ self.pretrain_dataloader = CycledDataLoader(pretrain_dataloader)
+
+ self.writer = None
+ if use_wandb and is_rank_0():
+ assert log_dir is not None, "log_dir must be provided when use_wandb is True"
+ import wandb
+
+ wandb.init(project="Coati-ppo", sync_tensorboard=True)
+ if log_dir is not None and is_rank_0():
+ import os
+ import time
+
+ from torch.utils.tensorboard import SummaryWriter
+
+ log_dir = os.path.join(log_dir, "ppo")
+ log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
+ self.writer = SummaryWriter(log_dir=log_dir)
+
def _make_experience(self, collect_step: int) -> Experience:
prompts = self.prompt_dataloader.next()
if self.offload_inference_models:
# TODO(ver217): this may be controlled by strategy if they are prepared by strategy
self.experience_maker.initial_model.to(self.device)
self.experience_maker.reward_model.to(self.device)
- if isinstance(prompts, Tensor):
- return self.experience_maker.make_experience(prompts, **self.generate_kwargs)
- elif isinstance(prompts, dict):
- return self.experience_maker.make_experience(**prompts, **self.generate_kwargs)
- else:
- raise ValueError(f'Unsupported input type "{type(prompts)}"')
+ assert isinstance(prompts, dict), f'Unsupported input type "{type(prompts)}"'
+ return self.experience_maker.make_experience(**prompts, **self.generate_kwargs)
- def _training_step(self, experience: Experience) -> Dict[str, float]:
+ def _training_step(self, experience: Experience):
self.actor.train()
self.critic.train()
# policy loss
- num_actions = experience.action_mask.size(1)
- actor_output = self.actor(experience.sequences, attention_mask=experience.attention_mask)
- action_log_probs = calc_action_log_probs(actor_output, experience.sequences, num_actions)
- actor_loss = self.actor_loss_fn(action_log_probs,
- experience.action_log_probs,
- experience.advantages,
- action_mask=experience.action_mask)
+ num_actions = experience.action_log_probs.size(1)
+ actor_logits = self.actor(experience.sequences, experience.attention_mask)["logits"]
+ action_log_probs = calc_action_log_probs(actor_logits, experience.sequences, num_actions)
+ actor_loss = self.actor_loss_fn(
+ action_log_probs, experience.action_log_probs, experience.advantages, action_mask=experience.action_mask
+ )
+ actor_loss = (1 - self.ptx_coef) * actor_loss
+ self.strategy.backward(actor_loss, self.actor, self.actor_optim)
# ptx loss
if self.ptx_coef != 0:
batch = self.pretrain_dataloader.next()
batch = to_device(batch, self.device)
- ptx_log_probs = self.actor(batch['input_ids'],
- attention_mask=batch['attention_mask'])['logits']
- ptx_loss = self.ptx_loss_fn(ptx_log_probs, batch['labels'])
- actor_loss = ptx_loss * self.ptx_coef + actor_loss * (1 - self.ptx_coef)
+ ptx_log_probs = self.actor(batch["input_ids"], batch["attention_mask"])["logits"]
+ ptx_loss = self.ptx_coef * self.ptx_loss_fn(ptx_log_probs, batch["labels"])
+ self.strategy.backward(ptx_loss, self.actor, self.actor_optim)
- self.strategy.backward(actor_loss, self.actor, self.actor_optim)
self.strategy.optimizer_step(self.actor_optim)
self.actor_optim.zero_grad()
# value loss
- values = self.critic(experience.sequences,
- action_mask=experience.action_mask,
- attention_mask=experience.attention_mask)
- critic_loss = self.critic_loss_fn(values,
- experience.values,
- experience.reward,
- action_mask=experience.action_mask)
+ values = self.critic(experience.sequences, attention_mask=experience.attention_mask)
+ critic_loss = self.critic_loss_fn(values, experience.values, experience.reward)
critic_loss = critic_loss * self.vf_coef
self.strategy.backward(critic_loss, self.critic, self.critic_optim)
self.strategy.optimizer_step(self.critic_optim)
self.critic_optim.zero_grad()
- return {'reward': experience.reward.mean().item()}
-
def _learn(self, update_step: int):
if self.offload_inference_models:
- self.experience_maker.initial_model.to('cpu')
- self.experience_maker.reward_model.to('cpu')
+ self.experience_maker.initial_model.to("cpu")
+ self.experience_maker.reward_model.to("cpu")
# buffer may be empty at first, we should rebuild at each training
if self.sample_buffer:
experience = self.data_buffer.sample()
self._on_learn_batch_start()
experience.to_device(self.device)
- metrics = self._training_step(experience)
- self._on_learn_batch_end(metrics, experience)
+ self._training_step(experience)
+ self._on_learn_batch_end(experience)
else:
if isinstance(self.dataloader.sampler, DistributedSampler):
self.dataloader.sampler.set_epoch(update_step)
- pbar = tqdm(
- self.dataloader,
- desc=f'Train epoch [{update_step + 1}]',
- disable=not is_rank_0()
- )
+ pbar = tqdm(self.dataloader, desc=f"Train epoch [{update_step + 1}]", disable=not is_rank_0())
for experience in pbar:
self._on_learn_batch_start()
experience.to_device(self.device)
- metrics = self._training_step(experience)
- self._on_learn_batch_end(metrics, experience)
- pbar.set_postfix(metrics)
+ self._training_step(experience)
+ self._on_learn_batch_end(experience)
diff --git a/applications/Chat/coati/trainer/rm.py b/applications/Chat/coati/trainer/rm.py
index 54a5d0f40dea..d7f8c21a5a3d 100644
--- a/applications/Chat/coati/trainer/rm.py
+++ b/applications/Chat/coati/trainer/rm.py
@@ -1,7 +1,5 @@
-from datetime import datetime
-from typing import Callable
+from typing import Callable, Optional
-import pandas as pd
import torch
import tqdm
from torch.optim import Optimizer
@@ -40,10 +38,12 @@ def __init__(
self.loss_fn = loss_fn
self.scheduler = lr_scheduler
+ self.num_train_step = 0
+
def _eval(self, epoch):
if self.eval_dataloader is not None:
self.model.eval()
- dist, on, cnt = 0, 0, 0
+ dist, num_correct, num_samples = 0, 0, 0
with torch.no_grad():
for chosen_ids, c_mask, reject_ids, r_mask in self.eval_dataloader:
chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device())
@@ -52,30 +52,21 @@ def _eval(self, epoch):
r_mask = r_mask.squeeze(1).to(torch.cuda.current_device())
chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
reject_reward = self.model(reject_ids, attention_mask=r_mask)
- for i in range(len(chosen_reward)):
- cnt += 1
- if chosen_reward[i] > reject_reward[i]:
- on += 1
+ num_samples += chosen_ids.size(0)
+ num_correct += (chosen_reward > reject_reward).sum().item()
dist += (chosen_reward - reject_reward).mean().item()
self.dist = dist / len(self.eval_dataloader)
- self.acc = on / cnt
+ self.acc = num_correct / num_samples
- if is_rank_0():
- log = pd.DataFrame(
- [[(epoch + 1) * len(self.train_dataloader),
- self.loss.item(), self.dist, self.acc]],
- columns=['step', 'loss', 'dist', 'acc']
- )
- log.to_csv('log.csv', mode='a', header=False, index=False)
+ if self.writer:
+ self.writer.add_scalar("eval/dist", self.dist, epoch)
+ self.writer.add_scalar("eval/acc", self.acc, epoch)
def _train(self, epoch):
self.model.train()
step_bar = tqdm.trange(
- len(self.train_dataloader),
- desc='Train step of epoch %d' % epoch,
- disable=not is_rank_0()
+ len(self.train_dataloader), desc=f"Epoch {epoch + 1}/{self.max_epochs}", disable=not is_rank_0()
)
- cnt = 0
for chosen_ids, c_mask, reject_ids, r_mask in self.train_dataloader:
chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device())
c_mask = c_mask.squeeze(1).to(torch.cuda.current_device())
@@ -83,29 +74,50 @@ def _train(self, epoch):
r_mask = r_mask.squeeze(1).to(torch.cuda.current_device())
chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
reject_reward = self.model(reject_ids, attention_mask=r_mask)
- self.loss = self.loss_fn(chosen_reward, reject_reward)
- self.strategy.backward(self.loss, self.model, self.optimizer)
+ loss = self.loss_fn(chosen_reward, reject_reward)
+ self.strategy.backward(loss, self.model, self.optimizer)
self.strategy.optimizer_step(self.optimizer)
self.optimizer.zero_grad()
- cnt += 1
- if cnt % 100 == 0:
+ if self.writer:
+ self.writer.add_scalar("train/loss", loss.item(), self.num_train_step)
+ self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]["lr"], self.num_train_step)
+ self.writer.add_scalar("train/dist", (chosen_reward - reject_reward).mean().item(), self.num_train_step)
+ self.writer.add_scalar(
+ "train/acc", (chosen_reward > reject_reward).float().mean().item(), self.num_train_step
+ )
+ self.num_train_step += 1
+ if self.num_train_step % 100 == 0:
self.scheduler.step()
step_bar.update()
step_bar.close()
- def _before_fit(self,
- train_dataloader: DataLoader,
- valid_dataloader: DataLoader,
- eval_dataloader: DataLoader):
+ def _before_fit(
+ self,
+ train_dataloader: DataLoader,
+ eval_dataloader: DataLoader,
+ log_dir: Optional[str] = None,
+ use_wandb: bool = False,
+ ):
"""
Args:
train_dataloader (DataLoader): the dataloader to use for training
- valid_dataloader (DataLoader): the dataloader to use for validation
eval_dataloader (DataLoader): the dataloader to use for evaluation
"""
- super()._before_fit()
- self.datetime = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
-
self.train_dataloader = train_dataloader
- self.valid_dataloader = valid_dataloader
self.eval_dataloader = eval_dataloader
+
+ self.writer = None
+ if use_wandb and is_rank_0():
+ assert log_dir is not None, "log_dir must be provided when use_wandb is True"
+ import wandb
+
+ wandb.init(project="Coati-rm", sync_tensorboard=True)
+ if log_dir is not None and is_rank_0():
+ import os
+ import time
+
+ from torch.utils.tensorboard import SummaryWriter
+
+ log_dir = os.path.join(log_dir, "rm")
+ log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
+ self.writer = SummaryWriter(log_dir=log_dir)
diff --git a/applications/Chat/coati/trainer/sft.py b/applications/Chat/coati/trainer/sft.py
index e4d0a970740d..7d0eeec897e5 100644
--- a/applications/Chat/coati/trainer/sft.py
+++ b/applications/Chat/coati/trainer/sft.py
@@ -1,10 +1,8 @@
-import time
from typing import Optional
import torch
import torch.distributed as dist
import tqdm
-import wandb
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader
@@ -39,48 +37,43 @@ def __init__(
accumulation_steps: int = 8,
) -> None:
if accumulation_steps > 1:
- assert not isinstance(strategy, GeminiStrategy), \
- "Accumulation steps are not supported in stage 3 of ColossalAI"
+ assert not isinstance(
+ strategy, GeminiStrategy
+ ), "Accumulation steps are not supported in stage 3 of ColossalAI"
super().__init__(strategy, max_epochs, model, optim)
self.accumulation_steps = accumulation_steps
self.scheduler = lr_scheduler
+ self.num_train_step = 0
+ self.num_eval_step = 0
+
def _train(self, epoch: int):
self.model.train()
- for batch_id, batch in enumerate(self.train_dataloader):
-
+ step_bar = tqdm.trange(
+ len(self.train_dataloader) // self.accumulation_steps,
+ desc=f"Epoch {epoch + 1}/{self.max_epochs}",
+ disable=not is_rank_0(),
+ )
+ for i, batch in enumerate(self.train_dataloader):
batch = to_device(batch, torch.cuda.current_device())
- if "attention_mask" in batch:
- outputs = self.model(batch["input_ids"],
- attention_mask=batch["attention_mask"],
- labels=batch["labels"])
- else:
- outputs = self.model(batch["input_ids"],
- labels=batch["labels"])
-
- loss = outputs.loss
- loss = loss / self.accumulation_steps
-
- self.strategy.backward(loss, self.model, self.optimizer)
-
+ outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
+ loss = outputs.loss / self.accumulation_steps
self.total_loss += loss.item()
-
+ self.strategy.backward(loss, self.model, self.optimizer)
# gradient accumulation
- if (batch_id + 1) % self.accumulation_steps == 0:
+ if (i + 1) % self.accumulation_steps == 0:
self.strategy.optimizer_step(self.optimizer)
self.optimizer.zero_grad()
self.scheduler.step()
- if is_rank_0() and self.use_wandb:
- wandb.log({
- "loss": self.total_loss / self.accumulation_steps,
- "lr": self.scheduler.get_last_lr()[0],
- "epoch": epoch,
- "batch_id": batch_id
- })
+ if self.writer:
+ self.writer.add_scalar("train/loss", self.total_loss, self.num_train_step)
+ self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], self.num_train_step)
+ self.num_train_step += 1
self.total_loss = 0
- self.step_bar.update()
+ step_bar.update()
+ step_bar.close()
def _eval(self, epoch: int):
if self.eval_dataloader is not None:
@@ -89,23 +82,26 @@ def _eval(self, epoch: int):
loss_sum, num_seen = 0, 0
for batch in self.eval_dataloader:
batch = to_device(batch, torch.cuda.current_device())
- outputs = self.model(batch["input_ids"],
- attention_mask=batch["attention_mask"],
- labels=batch["labels"])
- loss = outputs.loss
-
- loss_sum += loss.item()
+ outputs = self.model(
+ batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"]
+ )
+ loss_sum += outputs.loss.item()
num_seen += batch["input_ids"].size(0)
-
loss_mean = loss_sum / num_seen
if dist.get_rank() == 0:
- self.logger.info(f'Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}')
+ self.logger.info(f"Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}")
+ if self.writer:
+ self.writer.add_scalar("eval/loss", loss_mean, self.num_eval_step)
+ self.num_eval_step += 1
- def _before_fit(self,
- train_dataloader: DataLoader,
- eval_dataloader: Optional[DataLoader] = None,
- logger: Optional[DistributedLogger] = None,
- use_wandb: bool = False):
+ def _before_fit(
+ self,
+ train_dataloader: DataLoader,
+ eval_dataloader: Optional[DataLoader] = None,
+ logger: Optional[DistributedLogger] = None,
+ log_dir: Optional[str] = None,
+ use_wandb: bool = False,
+ ):
"""
Args:
train_dataloader: the dataloader to use for training
@@ -115,15 +111,20 @@ def _before_fit(self,
self.eval_dataloader = eval_dataloader
self.logger = logger
- self.use_wandb = use_wandb
- if use_wandb:
- wandb.init(project="Coati", name=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
- wandb.watch(self.model)
+ self.writer = None
+ if use_wandb and is_rank_0():
+ assert log_dir is not None, "log_dir must be provided when use_wandb is True"
+ import wandb
+
+ wandb.init(project="Coati-sft", sync_tensorboard=True)
+ if log_dir is not None and is_rank_0():
+ import os
+ import time
+
+ from torch.utils.tensorboard import SummaryWriter
+
+ log_dir = os.path.join(log_dir, "sft")
+ log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
+ self.writer = SummaryWriter(log_dir=log_dir)
self.total_loss = 0
- self.no_epoch_bar = True
- self.step_bar = tqdm.trange(
- len(self.train_dataloader) // self.accumulation_steps * self.max_epochs,
- desc=f'steps',
- disable=not is_rank_0()
- )
diff --git a/applications/Chat/coati/trainer/strategies/__init__.py b/applications/Chat/coati/trainer/strategies/__init__.py
index b49a2c742db3..521dcb5855b1 100644
--- a/applications/Chat/coati/trainer/strategies/__init__.py
+++ b/applications/Chat/coati/trainer/strategies/__init__.py
@@ -2,7 +2,4 @@
from .colossalai import GeminiStrategy, LowLevelZeroStrategy
from .ddp import DDPStrategy
-__all__ = [
- 'Strategy', 'DDPStrategy',
- 'LowLevelZeroStrategy', 'GeminiStrategy'
-]
+__all__ = ["Strategy", "DDPStrategy", "LowLevelZeroStrategy", "GeminiStrategy"]
diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py
index c20b2b16e396..a78716216ae0 100644
--- a/applications/Chat/coati/trainer/strategies/base.py
+++ b/applications/Chat/coati/trainer/strategies/base.py
@@ -19,7 +19,7 @@
class Strategy(ABC):
"""
- Base class for training strategies.
+ Base class for training strategies.
"""
def __init__(self, plugin_initializer: Callable[..., Optional[Plugin]] = lambda: None) -> None:
@@ -83,16 +83,18 @@ def prepare(self, *boost_args: _BoostArgSpec) -> Union[List[_BoostArgSpec], _Boo
rets.append((model, optimizer))
elif isinstance(arg, Dict):
model, optimizer, criterion, dataloader, lr_scheduler = self.booster.boost(**arg)
- boost_result = dict(model=model,
- optimizer=optimizer,
- criterion=criterion,
- dataloader=dataloader,
- lr_scheduler=lr_scheduler)
+ boost_result = dict(
+ model=model,
+ optimizer=optimizer,
+ criterion=criterion,
+ dataloader=dataloader,
+ lr_scheduler=lr_scheduler,
+ )
# remove None values
boost_result = {key: value for key, value in boost_result.items() if value is not None}
rets.append(boost_result)
else:
- raise RuntimeError(f'Type {type(arg)} is not supported')
+ raise RuntimeError(f"Type {type(arg)} is not supported")
return rets[0] if len(rets) == 1 else rets
@@ -108,8 +110,8 @@ def unwrap_model(model: nn.Module) -> nn.Module:
"""
return model
- def save_model(self, model: nn.Module, path: str, only_rank0: bool = True, **kwargs) -> None:
- self.booster.save_model(model, path, shard=not only_rank0, **kwargs)
+ def save_model(self, model: nn.Module, path: str, shard: bool = False, **kwargs) -> None:
+ self.booster.save_model(model, path, shard=shard, **kwargs)
def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None:
self.booster.load_model(model, path, strict)
@@ -125,11 +127,9 @@ def setup_sampler(self, dataset) -> DistributedSampler:
return DistributedSampler(dataset, 1, 0)
@abstractmethod
- def save_pretrained(self,
- model: nn.Module,
- path: str,
- only_rank0: bool = True,
- tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
+ def save_pretrained(
+ self, model: nn.Module, path: str, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
+ ) -> None:
pass
@abstractmethod
diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py
index fa55f97ad661..7129edb060ef 100644
--- a/applications/Chat/coati/trainer/strategies/colossalai.py
+++ b/applications/Chat/coati/trainer/strategies/colossalai.py
@@ -1,17 +1,12 @@
import warnings
from typing import Optional
-import torch
-import torch.distributed as dist
import torch.nn as nn
import colossalai
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
-from colossalai.booster.plugin.gemini_plugin import GeminiModel
from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.tensor import ProcessGroup, ShardSpec
from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext
from colossalai.zero.gemini.gemini_ddp import GeminiDDP
from .ddp import DDPStrategy
@@ -42,37 +37,34 @@ class LowLevelZeroStrategy(DDPStrategy):
"""
- def __init__(self,
- stage: int = 2,
- precision: str = 'fp16',
- seed: int = 42,
- placement_policy: str = 'cuda',
- reduce_bucket_size: int = 12 * 1024**2, # only for stage 1&2
- overlap_communication: bool = True, # only for stage 1&2
- initial_scale: float = 2**16,
- growth_factor: float = 2,
- backoff_factor: float = 0.5,
- growth_interval: int = 1000,
- hysteresis: int = 2,
- min_scale: float = 1,
- max_scale: float = 2**32,
- max_norm: float = 0.0,
- norm_type: float = 2.0
- ) -> None:
-
+ def __init__(
+ self,
+ stage: int = 2,
+ precision: str = "fp16",
+ seed: int = 42,
+ placement_policy: str = "cuda",
+ reduce_bucket_size: int = 12 * 1024**2, # only for stage 1&2
+ overlap_communication: bool = True, # only for stage 1&2
+ initial_scale: float = 2**16,
+ growth_factor: float = 2,
+ backoff_factor: float = 0.5,
+ growth_interval: int = 1000,
+ hysteresis: int = 2,
+ min_scale: float = 1,
+ max_scale: float = 2**32,
+ max_norm: float = 0.0,
+ norm_type: float = 2.0,
+ ) -> None:
assert stage in (1, 2), f'Unsupported stage "{stage}"'
- assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
- assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"'
+ assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'
+ assert precision in ("fp32", "fp16"), f'Unsupported precision "{precision}"'
plugin_initializer = lambda: LowLevelZeroPlugin(
- # zero_config
stage=stage,
precision=precision,
- # zero_optim_config
reduce_bucket_size_in_m=reduce_bucket_size,
overlap_communication=overlap_communication,
- cpu_offload=(placement_policy == 'cpu'),
- # optim_config
+ cpu_offload=(placement_policy == "cpu"),
initial_scale=initial_scale,
growth_factor=growth_factor,
backoff_factor=backoff_factor,
@@ -81,14 +73,15 @@ def __init__(self,
min_scale=min_scale,
max_scale=max_scale,
max_norm=max_norm,
- norm_type=norm_type
+ norm_type=norm_type,
)
super().__init__(seed, plugin_initializer)
def _post_init(self) -> None:
- assert isinstance(self.plugin, LowLevelZeroPlugin), \
- f'{type(self).__name__}\'s plugin is not initialized properly.'
+ assert isinstance(
+ self.plugin, LowLevelZeroPlugin
+ ), f"{type(self).__name__}'s plugin is not initialized properly."
def setup_distributed(self) -> None:
colossalai.launch_from_torch({}, seed=self.seed)
@@ -131,54 +124,55 @@ class GeminiStrategy(DDPStrategy):
"""
- def __init__(self,
- seed: int = 42,
- shard_init: bool = False, # only for stage 3
- placement_policy: str = 'cuda',
- pin_memory: bool = True, # only for stage 3
- force_outputs_fp32: bool = False, # only for stage 3
- search_range_m: int = 32, # only for stage 3
- hidden_dim: Optional[int] = None, # only for stage 3
- min_chunk_size_m: float = 32, # only for stage 3
- gpu_margin_mem_ratio: float = 0.0, # only for stage 3
- initial_scale: float = 2**16,
- growth_factor: float = 2,
- backoff_factor: float = 0.5,
- growth_interval: int = 1000,
- hysteresis: int = 2,
- min_scale: float = 1,
- max_scale: float = 2**32,
- max_norm: float = 0.0,
- norm_type: float = 2.0
- ) -> None:
-
- assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"'
-
+ def __init__(
+ self,
+ seed: int = 42,
+ shard_init: bool = False, # only for stage 3
+ placement_policy: str = "auto",
+ shard_param_frac: float = 1.0, # only for static placement
+ offload_optim_frac: float = 0.0, # only for static placement
+ offload_param_frac: float = 0.0, # only for static placement
+ pin_memory: bool = True, # only for stage 3
+ force_outputs_fp32: bool = False, # only for stage 3
+ search_range_m: int = 32, # only for stage 3
+ hidden_dim: Optional[int] = None, # only for stage 3
+ min_chunk_size_m: float = 32, # only for stage 3
+ gpu_margin_mem_ratio: float = 0.0, # only for stage 3
+ initial_scale: float = 2**16,
+ growth_factor: float = 2,
+ backoff_factor: float = 0.5,
+ growth_interval: int = 1000,
+ hysteresis: int = 2,
+ min_scale: float = 1,
+ max_scale: float = 2**32,
+ max_norm: float = 0.0,
+ norm_type: float = 2.0,
+ ) -> None:
# TODO(ver217): support shard_init when using from_pretrained()
if shard_init:
warnings.warn(
- f'Shard init is not supported model.from_pretrained() yet. '
- 'Please load weights after strategy.prepare()'
+ f"Shard init is not supported model.from_pretrained() yet. "
+ "Please load weights after strategy.prepare()"
)
self.shard_init = shard_init
- warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.')
+ warnings.warn(f"Stage 3 only supports fp16. Precision is set to fp16.")
# NOTE: dist should be initialized before calling get_current_device()
plugin_initializer = lambda: GeminiPlugin(
- # gemini_config
- device=get_current_device(),
+ chunk_init_device=get_current_device(),
placement_policy=placement_policy,
- precision='fp16',
+ shard_param_frac=shard_param_frac,
+ offload_optim_frac=offload_optim_frac,
+ offload_param_frac=offload_param_frac,
+ precision="fp16",
pin_memory=pin_memory,
force_outputs_fp32=force_outputs_fp32,
strict_ddp_mode=shard_init,
search_range_m=search_range_m,
hidden_dim=hidden_dim,
min_chunk_size_m=min_chunk_size_m,
- # zero_optim_config
gpu_margin_mem_ratio=gpu_margin_mem_ratio,
- # optim_config
initial_scale=initial_scale,
growth_factor=growth_factor,
backoff_factor=backoff_factor,
@@ -187,29 +181,20 @@ def __init__(self,
min_scale=min_scale,
max_scale=max_scale,
max_norm=max_norm,
- norm_type=norm_type
+ norm_type=norm_type,
)
super().__init__(seed, plugin_initializer)
def _post_init(self) -> None:
- assert isinstance(self.plugin, GeminiPlugin), \
- f'{type(self).__name__}\'s plugin is not initialized properly.'
+ assert isinstance(self.plugin, GeminiPlugin), f"{type(self).__name__}'s plugin is not initialized properly."
def setup_distributed(self) -> None:
colossalai.launch_from_torch({}, seed=self.seed)
def model_init_context(self):
- world_size = dist.get_world_size()
- shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None
- default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None
- return ColoInitContext(device=get_current_device(),
- dtype=torch.half,
- default_pg=shard_pg,
- default_dist_spec=default_dist_spec)
+ return super().model_init_context()
def unwrap_model(self, model: nn.Module) -> nn.Module:
- assert isinstance(model, GeminiModel)
- ddp_model = model.unwrap()
- assert isinstance(ddp_model, GeminiDDP)
- return ddp_model.module
+ assert isinstance(model, GeminiDDP)
+ return model.module
diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py
index a52b0460daa8..f2a44aeb0961 100644
--- a/applications/Chat/coati/trainer/strategies/ddp.py
+++ b/applications/Chat/coati/trainer/strategies/ddp.py
@@ -31,24 +31,21 @@ def get_grad_required_state_dict(model: nn.Module):
class DDPStrategy(Strategy):
"""
- Strategy for distributed training using torch.distributed.
+ Strategy for distributed training using torch.distributed.
"""
- def __init__(self,
- seed: int = 42,
- plugin_initializer: Callable = TorchDDPPlugin
- ) -> None:
+ def __init__(self, seed: int = 42, plugin_initializer: Callable = TorchDDPPlugin) -> None:
self.seed = seed
super().__init__(plugin_initializer)
def _try_init_dist(self, force: bool = False) -> None:
try:
- rank = int(os.environ['RANK'])
- local_rank = int(os.environ['LOCAL_RANK'])
- world_size = int(os.environ['WORLD_SIZE'])
- host = os.environ['MASTER_ADDR']
- port = int(os.environ['MASTER_PORT'])
- dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
+ rank = int(os.environ["RANK"])
+ local_rank = int(os.environ["LOCAL_RANK"])
+ world_size = int(os.environ["WORLD_SIZE"])
+ host = os.environ["MASTER_ADDR"]
+ port = int(os.environ["MASTER_PORT"])
+ dist.init_process_group("nccl", init_method=f"tcp://[{host}]:{port}", world_size=world_size, rank=rank)
torch.cuda.set_device(local_rank)
except KeyError as e:
if force:
@@ -60,8 +57,7 @@ def _try_init_dist(self, force: bool = False) -> None:
raise e
def _post_init(self) -> None:
- assert isinstance(self.plugin, TorchDDPPlugin), \
- f'{type(self).__name__}\'s plugin is not initialized properly.'
+ assert isinstance(self.plugin, TorchDDPPlugin), f"{type(self).__name__}'s plugin is not initialized properly."
def setup_distributed(self) -> None:
self._try_init_dist(force=True)
@@ -73,12 +69,14 @@ def set_seed(self, seed: int) -> None:
torch.manual_seed(seed)
def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = False) -> DataLoader:
- return self.plugin.prepare_dataloader(data_buffer,
- batch_size=data_buffer.sample_batch_size,
- shuffle=True,
- drop_last=True,
- pin_memory=pin_memory,
- collate_fn=data_buffer.collate_fn)
+ return self.plugin.prepare_dataloader(
+ data_buffer,
+ batch_size=data_buffer.sample_batch_size,
+ shuffle=True,
+ drop_last=True,
+ pin_memory=pin_memory,
+ collate_fn=data_buffer.collate_fn,
+ )
def setup_sampler(self, dataset) -> DistributedSampler:
# FIXME(cwher): this is only invoked in train_on_ray, not tested after adapt Boost API.
@@ -88,12 +86,10 @@ def unwrap_model(self, model: nn.Module) -> nn.Module:
assert isinstance(model, TorchDDPModel), "model is not wrapped by TorchDDPModel."
return model.unwrap()
- def save_pretrained(self,
- model: nn.Module,
- path: str,
- only_rank0: bool = True,
- tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
- if not only_rank0 or dist.get_rank() == 0:
+ def save_pretrained(
+ self, model: nn.Module, path: str, shard: bool = False, tokenizer: Optional[PreTrainedTokenizerBase] = None
+ ) -> None:
+ if dist.get_rank() == 0:
unwrapped_model = self.unwrap_model(model)
assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
pretrained_model = unwrapped_model.model
@@ -102,35 +98,29 @@ def save_pretrained(self,
pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
if tokenizer is not None:
tokenizer.save_pretrained(path)
- model_path = os.path.join(path, "pytorch_model.bin")
- self.save_model(model,
- model_path,
- only_rank0=only_rank0)
- def _replace_keys(model_path: str,
- replace_fn: Callable):
+ model_path = os.path.join(path, "pytorch_model.bin")
+ self.save_model(model, model_path, shard=shard)
+ def _replace_keys(model_path: str, replace_fn: Callable):
state_dict = torch.load(model_path, map_location="cpu")
- state_dict = {
- replace_fn(k): v
- for k, v in state_dict.items()
- }
+ state_dict = {replace_fn(k): v for k, v in state_dict.items()}
torch.save(state_dict, model_path)
-
# FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
# HACK: rename keys of pytorch_model.bin
if dist.get_rank() == 0:
_replace_keys(model_path, lambda k: k.replace("model.", "", 1))
+
def get_model_state_dict_shard(self, model: nn.Module, **config):
# TODO: implement sharding on naive strategy
model = self.unwrap_model(model)
- if 'requires_grad_only' in config and config['requires_grad_only'] == True:
+ if "requires_grad_only" in config and config["requires_grad_only"] == True:
state_dict = get_grad_required_state_dict(model)
else:
state_dict = model.state_dict()
- if 'shard_size' in config:
- shard_size = config['shard_size']
+ if "shard_size" in config:
+ shard_size = config["shard_size"]
accumulate_size = 0
state_dict_shard = OrderedDict()
for name, param in state_dict.items():
diff --git a/applications/Chat/coati/trainer/strategies/sampler.py b/applications/Chat/coati/trainer/strategies/sampler.py
index d726fa640fa2..6e811bef11a5 100644
--- a/applications/Chat/coati/trainer/strategies/sampler.py
+++ b/applications/Chat/coati/trainer/strategies/sampler.py
@@ -4,7 +4,6 @@
class DistributedSampler:
-
def __init__(self, dataset, num_replicas: int, rank: int) -> None:
self.dataset = dataset
self.num_replicas = num_replicas
@@ -12,7 +11,7 @@ def __init__(self, dataset, num_replicas: int, rank: int) -> None:
if len(self.dataset) % self.num_replicas != 0:
self.num_samples = math.ceil(
- (len(self.dataset) - self.num_replicas) / self.num_replicas # type: ignore[arg-type]
+ (len(self.dataset) - self.num_replicas) / self.num_replicas # type: ignore[arg-type]
)
else:
self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)
@@ -20,10 +19,10 @@ def __init__(self, dataset, num_replicas: int, rank: int) -> None:
self.total_size = self.num_samples * self.num_replicas
indices = list(range(len(self.dataset)))
- indices = indices[:self.total_size]
+ indices = indices[: self.total_size]
assert len(indices) == self.total_size
# subsample
- indices = indices[self.rank:self.total_size:self.num_replicas]
+ indices = indices[self.rank : self.total_size : self.num_replicas]
assert len(indices) == self.num_samples
self.indices = indices
diff --git a/applications/Chat/coati/trainer/utils.py b/applications/Chat/coati/trainer/utils.py
index 7e2cb9c634f7..7811e7365eeb 100644
--- a/applications/Chat/coati/trainer/utils.py
+++ b/applications/Chat/coati/trainer/utils.py
@@ -42,7 +42,6 @@ def is_rank_0() -> bool:
def to_device(x: Any, device: torch.device) -> Any:
-
def _to(t: Any):
if isinstance(t, torch.Tensor):
return t.to(device)
diff --git a/applications/Chat/evaluate/README.md b/applications/Chat/evaluate/README.md
deleted file mode 100644
index 0a97ae72f9d0..000000000000
--- a/applications/Chat/evaluate/README.md
+++ /dev/null
@@ -1,396 +0,0 @@
-# Evaluation
-
-In this directory, we introduce how you can evaluate your model with our pipeline. This pipeline is now available for evaluation of both Chinese and English capability.
-
-## Installation
-
-To start model evaluation, you need to install required packages which listed in `requirements.txt` under `evaluate` folder.
-
-```shell
-pip install -r requirements.txt
-```
-
-## Evaluation Pipeline
-
-The whole evaluation pipeline consists of three methods:
-
-1. `GPT Evaluation`: evaluates model predictions using GPT models.
- - Compare the performance of two different models (battle).
- - Rate the model according to pre-defined metrics using prompting design.
- - Rate the model according to pre-defined metrics with additional reference answer using prompting design.
-2. `Automatic Evaluation`: evaluates model predictions using automatic metrics.
-3. `UniEval`: evaluates model predictions using UniEval models(English only).
-
-### Evaluation Category
-
-Our evaluation pipeline examines the model's capability using 10 categories of questions. The following table introduces each category:
-
-| Evaluation Category | Description |
-| :-----------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| Brainstorming | Models are asked to generate a range of creative and diverse ideas according to the question. The capability of creativity is required. |
-| Chat | Models are asked to continue a multi-round dialogue given the roles involved. The capability of understanding, memorizing previous rounds of the dialogue and answering according to the persona provided is required. |
-| Classification | Models are asked to do classification tasks. The capability of accurate classification is required. |
-| Closed QA | Models are asked to answer a closed QA question. The capability of answering questions with limited scope (such as single/multiple choice question) is required. |
-| Extraction | Models are asked to extract information from a given material. The capability of extracting required information is required. |
-| Generation | Models are asked to generate an email, letter, article, etc. The capability of generating texts in a high quality and human-written way is required. |
-| Open QA | Models are asked to answer an open QA question(without context provided). The capability of answering questions with the models' own knowledge base is required. |
-| Roleplay | Models are asked to play the role provided. The capability of engaging in the scenario and effectively interacting with the user is required. |
-| Rewriting | Models are asked to do rewriting tasks such as translation and grammar correction. The capability of rewriting according to different instructions is required. |
-| Summarization | Models are asked to summarize the given paragraph or passage. The capability of summarization is required. |
-
-To better understand each evaluation category, here are some example questions provided.
-
-| Evaluation Category | Chinese Example | English Example |
-| :-----------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| Brainstorming | **Example 1:** 请介绍一下人工智能的多个领域。
**Example 2:** 请给出管理家庭财务的 3 个小技巧。 | **Example 1:** How can I improve my memory? Any useful techniques you can suggest?
**Example 2:** What are some ways to increase productivity while working from home? |
-| Chat | **Example 1:** 基于以下角色信息完成一段对话。小张是一名新手爱好者,对养鸡有浓厚的兴趣。老李是一名有丰富经验的养鸡大师。 小张:您好,老李,我最近开始对养鸡感兴趣了,想请教您一些问题。 老李:你好,小张,我很乐意帮助你。你想问些什么? 小张:我想知道如何确定鸡的品种和性别? 老李:确切的品种可以通过鸡的外貌特征来确定,而性别一般是通过鸡卵的大小和形状来判断。还有什么问题吗? 小张:
**Example 2:** 基于以下角色信息完成一段对话。小明是一名医生,一位老年病患者想要停药,但他对病情有所忽视并有担忧;王叔叔是老年病患者的儿子,希望能够听取医生的建议。 小明:你好,王叔叔,我了解你想要让你父亲停药。 王叔叔:是的,我父亲已经吃了那么久的药,我担心药物对他的身体会有副作用。 小明: | **Example 1:** Complete a conversation based on the following character information. Amy is a 30-year-old chef who runs her own restaurant. Jack is a food blogger who specializes in reviewing local restaurants. Amy: Hi Jack, I heard that you're a food blogger. Nice to meet you. Jack: Hi Amy, yes I am. Your restaurant has been receiving a lot of good reviews lately. Amy: Yes, we use only fresh and quality ingredients, and every dish is carefully crafted. Jack:
**Example 2:** Complete a dialogue based on the following role information. A: Elementary student B: Teacher B: Good morning, Student A. Today we're going to learn about addition and subtraction. A: Teacher, I already know this very well. Why do I need to learn it again? B: |
-| Classification | **Example 1:** 新闻标题:今日立夏,有一上联,立夏万物并秀,下联怎么对? 请根据以上新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。
**Example 2:** 新闻标题:赵丽颖很久没有登上微博热搜了,但你们别急,她只是在憋大招而已。 请根据新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。 | **Example 1:** Title: Fighting for Love (2020) Description: Jasmine got obsessed with a man and now he's obsessed with her. Steamy nights, kisses and rules being broken awaits them. She turned his whole world upside down and now he's doing it to hers. In this free fall, can they survive each others love?\" Based on the above information, determine which genre the work of art belongs to. You can only choose one from \"sport\", \"horror\", \"drama\", \"history\", \"romance\", \"biography\", \"science fiction\", \"comedy\", \"animation\", \"documentary\", \"music\" and \"news\".
**Example2:** Title: Summer Breeze: The Isley Brothers Greatest Hits Live (2005) Description: Filmed in the US in 2005 and captured in excellent form led by Ron Isley's vocals and Ernie Isley's hard edged guitar. Virtually every track is a hit including Shout, Who's That Lady, Twist And Shout, Summer Breeze and Harvest For The World. Based on the above information, determine which genre the work of art belongs to. You can only choose one from \"sport\", \"horror\", \"drama\", \"history\", \"romance\", \"biography\", \"science fiction\", \"comedy\", \"animation\", \"documentary\", \"music\" and \"news\"." |
-| Closed QA | **Example 1:** 请从以下选项中选择正确答案。以下哪个是世界上最高山峰? A. 长城 B. 泰山 C. 珠穆朗玛峰 D. 黄山
**Example 2:** 请从以下选项中选择一个最佳答案回答下面的问题。问题:非洲最高的山是哪座山? 选项: A. 麦金利山 B. 喜马拉雅山 C. 乞力马扎罗山 | **Example 1:** Which of the following options is NOT a primary color? (a) yellow (b) blue (c) orange (d) red
**Example 2:** Choose the correct option to complete the following sentence: \"Harry Potter and the Chamber of Secrets\" is the **\_\_\_\_** book in the Harry Potter series. (A) first (B) second (C) third (D) fourth |
-| Extraction | **Example 1:** 根据以下新闻文本,提取新闻报道时间,例如回答时按照格式“新闻报道时间:2007 年 8 月 10 日” 新闻文本如下:2007-4-7 中新网 4 月 7 日电据中国消防在线消息,4 月 4 日晚上 7 时 30 分左右,湖南长潭高速公路上发生一起 6 车连环相撞失火事故。长株潭三地消防部门共出动消防车 21 台,警力 100 余人。经过消防官兵近 2 个小时奋力扑救,大火被成功扑灭。据初步调查,有 1 人在此次事故中死亡。
**Example 2:** 根据以下新闻文本,提取新闻报道时间,例如回答时按照格式“新闻报道时间:2007 年 8 月 10 日” 新闻文本如下:2014 年 1 月 15 日,据外媒《俄罗斯报》报道称,位于北半球的澳大利亚现在正处于炎热的夏季,而近日也到了高温酷暑的时候,当地时间 1 月 14 日晚,澳大利亚南部一夜间发生至少 250 起火灾。受炎热天气及雷雨天气影响,澳大利亚南部一夜间发生至少 250 起火灾,灾情多集中在维多利亚州。火灾发生后,救援人员立即展开救灾行动。目前,大部分起火点火势已被控制。 | **Example 1:** Ernest Hemingway, an American literary giant known for his spare and direct writing style, has penned timeless works such as 'The Old Man and the Sea', 'For Whom the Bell Tolls', and 'A Farewell to Arms', which have made a profound impact on the literary world and continue to be widely read and admired today. Extract the name of the author mentioned above.
**Example 2:** In the epic fantasy series 'A Song of Ice and Fire', George R.R. Martin weaves a complex web of political intrigue, war, and magic across the fictional continents of Westeros and Essos. Martin's richly developed characters and intricate plotlines have captivated readers worldwide, much like his other acclaimed works such as 'A Clash of Kings' and 'A Storm of Swords'. Extract the name of the author in the above material. |
-| Generation | **Example 1:** 请撰写一篇文章,介绍如何通过改善生活习惯来预防疾病和延长寿命。
**Example 2:** 请根据以下情节撰写一篇短篇小说:一名年轻人被困在一个荒岛上,他必须想办法生存下去直到被救援。但他很快发现自己并不孤单。 | **Example 1:** Write a descriptive paragraph about an island to relax and unwind, including details about the location and atmosphere.
**Example 2:** Can you help me write a persuasive email to my colleagues encouraging them to participate in a charitable fundraising event? |
-| Open QA | **Example 1:** 请问万有引力定律由谁提出的?
**Example 2:** 哪些国家参与了第一次世界大战? | **Example 1:** What are the four basic tastes of the human palate?
**Example 2:** Who painted the The Scream? |
-| Rewriting | **Example 1:** 请将以下句子改为正确的语序。 生日快乐你祝他了吗?
**Example 2:** 将以下文本翻译成英语: “这个周末我要去海边玩” | **Example 1:** Please translate the following sentences, which are a mixture of Chinese and English, into full English. 我需要买一些 healthy snacks,比如 nuts 和 dried fruits,作为我的 office 的午餐.
**Example 2:** Please rewrite the sentence using an inverted sentence structure. We won't begin our journey until the sun sets. |
-| Roleplay | **Example 1:** 我想让你担任 Android 开发工程师面试官。我将成为候选人,您将向我询问 Android 开发工程师职位的面试问题。我希望你只作为面试官回答。不要一次写出所有的问题。我希望你只对我进行采访。问我问题,等待我的回答。不要写解释。像面试官一样一个一个问我,等我回答。我的第一句话是“面试官你好”。
**Example 2:** 我想让你扮演讲故事的角色。你会想出引人入胜、富有想象力和吸引观众的有趣故事。它可以是童话故事、教育故事或任何其他类型的有潜力的故事以吸引人们的注意力和想象力。根据目标受众,您可以为您的讲故事环节选择特定的主题或主题,例如,如果是儿童,那么您可以谈论动物;如果是成人,那么基于历史的故事可能会更好地吸引他们等。我的第一个请求是我需要一个关于毅力的有趣故事。 | **Example 1:** Assume the role of a marriage counselor. Develop a series of communication exercises for a couple who are experiencing difficulties in their relationship. These exercises should promote active listening, empathy, and effective expression of emotions. Your first assignment is to provide a set of three exercises that focus on resolving conflicts and rebuilding trust.
**Example 2:** I want you to act as a travel agent. I will tell you my desired destination, travel dates, and budget, and it will be your job to suggest the best travel itinerary for me. Your recommendations should include the best transportation options, hotel accommodations, and any popular tourist attractions nearby. My first request is "I want to plan a trip to Tokyo for a week, with a budget of $2000. I want to explore the culture and food of the city." |
-| Summarization | **Example 1:** 请简要总结概括以下段落材料。 当地时间 29 日,泰国卫生部通报,新增 143 名新冠肺炎确诊病例和 1 名死亡病例。截止到当地时间 29 日上午,泰国累计确诊病例 1388 例,其中泰国籍 1172 例,非泰国籍 216 例。死亡病例累计 7 例。(原题为《泰国新增 143 例新冠肺炎确诊病例累计确诊 1388 例》)
**Example 2:** 请简要总结概括以下段落材料。 近期,参与京雄高铁站站房建设的中铁十二局,因在施工过程中存在环境违法行为被雄安新区公开通报。通报发出后,引起社会广泛关注。近日,人民网记者从雄安新区相关部门及中铁十二局获悉,新区有关部门已经集中约谈了中铁十二局等 24 个参与雄安建设的项目单位。对于约谈内容和结果,中铁十二局有关宣传负责人回应:“具体内容不清楚,最好找雄安新区相关部门了解情况。”新区有关部门负责人表示,此前涉及的环境违法行为,中铁十二局已基本整改到位,但约谈内容和结果暂不公开,接下来,将按部就班推进环境治理工作。(原题为《雄安新区:中铁十二局涉环境违法已基本整改到位》) | **Example 1:** The 21 year-old-woman was treated by paramedics after the kitchen fire in Botfield Road in Shifnal, Shropshire. West Mercia Police said it is treating Wednesday morning's incident as arson and are appealing for any witnesses to contact them.The 50-year-old man has been arrested on suspicion of arson with intent to endanger life. For more on this and other stories from Shropshire. Please briefly summarize the above material within 20 words.
**Example 2:** South Wales Police were called to a property in Heolgerrig, Merthyr Tydfil, at about 13:40 BST on Sunday. The child was airlifted to Prince Charles Hospital but died shortly afterwards. Police are investigating the circumstances surrounding the incident and have appealed for witnesses. The girl's family are being supported by specially trained officers. Please briefly summarize the above material within 20 words. |
-
-### Evaluation Metrics
-
-#### GPT Evaluation
-
-GPT evaluation uses GPT models to evaluate the prediction of different models and different pre-defined evaluation metrics are applied to different categories. The following table shows the 11 pre-defined evaluation metrics both in Chinese and English:
-
-| Evaluation Metric | Prompt Words | CoT(Chain-of-Thought) |
-| :----------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| 语言组织 (Language organization) | 语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc. | 1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。 2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说 3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。 4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。 5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。 6. 根据以上因素综合评估答案的语言组织,并给出一个 1 到 5 的分数,其中 5 表示语言组织非常好,而 1 表示语言组织非常差。1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes. 2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory. 3. Determine if the answer is relevant to the question or topic and conveys a clear message. 4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs. 5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information. 6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization. |
-| 切题 (Relevance) | 切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic. | 1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。 2. 阅读答案,确认答案是否直接回答了题目所问的问题。 3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。 4. 根据以上因素综合评估答案的切题程度,并给出一个 1 到 5 的分数,其中 5 表示答案非常切题,而 1 表示答案完全没有切题。1. Read the question to determine what the question asks and what aspects of the question need to be answered. 2. Read the answers to make sure that they directly answer the question asked. 3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc. 4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all. |
-| 创意性 (Creativity) | 创意性(1-5):某些头脑风暴问题可能需要答案具有创意,提出新的思路。Creativity (1-5): Some brainstorming questions may require answers that are creative and suggest new ideas. | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。 2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则创意性评分可能会受到影响。 3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠,但仍然可以被认为是有创意的,只要它提供了新的角度或方法来解决问题。 4. 根据答案的创意性,给出一个 1 到 5 的评分。如果答案缺乏创意,则应给出一个较低的评分。如果答案具有创意并提供了新的思路,应给出一个较高的评分。1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions. 2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the creativity score may be affected. 3. Consider whether the answer contains novel ideas or unique thoughts. An answer may overlap with a known solution and still be considered creative, as long as it offers a new perspective or approach to the problem. 4. Give a score of 1 to 5 depending on the creativity of the answer. If the answer lacks creativity, a lower score should be given. If the answer is creative and provides a new idea, a higher score should be given. |
-| 实用性 (Practicality) | 实用性(1-5):某些头脑风暴问题可能需要答案提出实用的建议或解决方法。Practicality (1-5): Some brainstorming questions may require answers to suggest practical suggestions or solutions. | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。 2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则实用性评分可能会受到影响。 3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好,但如果无法实现或应用,则实用性评分可能会受到影响。 4. 根据答案的实用性,给出一个 1 到 5 的评分。如果答案缺乏实用性,则应给出一个较低的评分。如果答案提出了实用的建议或解决方法,并且可以很好地解决问题,则应给出一个较高的评分。1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions. 2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the practicality score may be affected. 3. Consider whether the suggestions or solutions presented in the answer are practical and workable. The answer may look good, but if it cannot be implemented or applied, the practicality score may be affected. 4. Give a score of 1 to 5 depending on the practicality of the answer. If the answer lacks practicality, a lower score should be given. If the answer makes a practical suggestion or solution and solves the problem well, a higher score should be given. |
-| 正确性 (Correctness) | 正确性(1-5):正确性(1-5):答案是否正确。 Correctness (1-5): whether the answer is correct or not. | 1. 仔细阅读题目,尝试自己回答该问题。 2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为 5 分。如果答案是部分正确的,则可以给予适当的得分,例如 2 分、3 分或 4 分。如果答案完全不正确,则只得 1 分。
1. Read the question carefully and try to answer the question yourself. 2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be given. If the answer is completely incorrect, only 1 point is awarded. |
-| 自然 (Naturalness) | 自然(1-5):答案是否自然,并且符合问题给定的身份。Naturalness (1-5): whether the answer is natural and fits the identity given by the question. | 1. 阅读题目,确定题目提供的身份信息。 2. 检查答案内容是否符合题目给定的身份。 3. 根据以上因素,对该回答的自然性进行打分,分数从 1 到 5,其中 1 表示不自然,5 表示非常自然,并符合问题给定的身份。1. Read the question and determine the identity information provided in the question. 2. Check whether the content of the answer matches the identity given in the question. 3. Based on the above factors, score the naturalness of the response on a scale from 1 to 5, where 1 means unnatural and 5 means very natural and in accordance with the identity given in the question. |
-| 参与感 (Engagingness) | 参与感(1-5):答案是否对前面的对话内容做出了恰当的反应,是否理解对话的语境和背景。Engagingness (1-5): whether the answer responds appropriately to the content of the preceding conversation and whether it understands the context and background of the conversation. | 1. 阅读题目,确定对话的语境和背景。 2. 检查答案是否充分理解对话的语境和背景,能否自然地融入到对话中而不显得突兀。 3. 根据以上因素,对该回答的参与感进行打分,分数从 1 到 5,其中 1 表示没有参与感,5 表示非常有参与感,并且恰当地理解了对话的语境和背景。1. Read the questions to determine the context and background of the dialogue. 2. Check that the answer fully understands the context and background of the conversation and that it fits naturally into the conversation without seeming abrupt. 3. Based on the above factors, rate the response's engagement on a scale from 1 to 5, where 1 means not engaged and 5 means very engaged and appropriately understands the context and background of the conversation. |
-| 合理性 (Reasonableness) | 合理性(1-5):答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。Reasonableness (1-5): Whether the answer can form a logical connection with the content of the previous dialogue, whether it is consistent with common sense, and whether it can reasonably exist in this context. | 1. 阅读题目,确定对话的主题以及问题期望的回答方向。 2. 判断答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。 3. 根据以上因素,对该回答的合理性进行打分,分数从 1 到 5,其中 1 表示不合理,5 表示非常合理,并且能够与前面的对话内容形成逻辑上的衔接,并符合常理。1. Read the question and determine the topic of the conversation and the direction the question expects the answer to go. 2. Determine whether the answer can be logically connected to the preceding conversation, whether it makes common sense, and whether it can reasonably exist in this context. 3. Based on the above factors, rate the reasonableness of the answer on a scale from 1 to 5, where 1 means unreasonable and 5 means very reasonable and able to form a logical connection with the preceding dialogue content and consistent with common sense. |
-| 多样性 (Diversity) | 多样性(1-5):答案使用语言是否优美,具有有一定的创造性和想象力。然而,回答也应该保持合理和适度,不要过于夸张或离题。Diversity (1-5): Whether the answers use beautiful language and have some creativity and imagination. However, answers should also be kept reasonable and moderate, not overly exaggerated or off-topic. | 1. 仔细阅读整个回答,确保完全理解回答所表达的内容和主题。 2. 在阅读回答的同时,注意语言的质量,例如措辞是否正确,语言是否生动等。 3. 检查回答的创造性和想象力,看看回答是否能够吸引人阅读下去。 4. 检查回答的合理性和适度,看看回答是否夸张或离题。5. 将多样性的评分打分在 1 到 5 之间,5 分表示回答的质量很好,能够吸引人阅读,1 分表示回答的内容生硬或者有离题的问题。1. Read the entire response carefully to ensure that you fully understand the content and theme expressed in the response. 2. While reading the response, pay attention to the quality of the language, such as whether the wording is correct and the language is vivid. 3. Check the creativity and imagination of the response to see if the response is engaging to read on. 4. Check the reasonableness and appropriateness of the responses to see if the responses are exaggerated or off-topic. 5. Rate the diversity on a scale of 1 to 5, with a 5 indicating a good quality response that is engaging to read and a 1 indicating a raw response or a question that is off-topic. |
-| 保真度 (Fidelity) | 保真度(1-5):答案是否能够严格遵守角色的设定回答给定的请求。Fidelity (1-5): whether the answer is able to answer the given request in strict compliance with the role setting. | 1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。 阅读题目的请求,确认回答请求时需要注意的细节。 3. 对比提供的回答与该角色的设定,评估回答是否能够严格遵守角色的设定。 4. 结合以上评估结果给出保真度的评分,范围从 1 到 5 分,其中 1 分表示回答与角色设定完全不符,5 分表示回答完全符合角色设定且满足给定请求。1. Read the question carefully to understand how the character is set up and represented in the question, including aspects such as occupation, background, point of view, and personality. 2. Read the question's request and confirm the details that need to be taken into account when answering the request. 3. Compare the provided answer with the setting of the role and assess whether the answer can strictly adhere to the setting of the role. 4. Combine the results of the above assessment to give a fidelity score ranging from 1 to 5, where a score of 1 means that the response does not match the persona at all, and a score of 5 means that the response fully complies with the persona and satisfies the given request. |
-| 简明扼要 (Conciseness) | 简明扼要(1-5):答案是否简明扼要,没有冗余内容。Conciseness (1-5): answers should be concise and without redundant content. | 1. 阅读题目,提取出材料的重点。 2. 阅读该总结,并注意其中的主要观点和信息。 3. 评估总结的长度。一个简明扼要的总结通常应该在几句话或几段文字内传达关键信息,而不是冗长的段落或文章。 4. 检查总结是否包含与主要观点无关的信息或冗余信息。 5. 确定总结涵盖了材料中的关键信息,并且没有忽略任何重要细节。 6. 给总结打出 1-5 的分数,其中 5 表示总结简明扼要,没有冗余内容,而 1 表示总结冗长或包含不必要的信息,难以理解或记忆。根据您的判断,打出适当的得分。1. Read the title and extract the main points of the material. 2. Read the summary and note the main ideas and messages in it. 3. Assess the length of the summary. A concise summary should usually convey key information within a few sentences or paragraphs, rather than lengthy paragraphs or essays. 4. Check that the summary does not contain information that is not relevant to the main ideas or that is redundant. 5. Make sure that the summary covers the key information in the material and that no important details have been omitted. 6. Rate the summary on a scale of 1-5, where 5 means the summary is concise and free of redundancy, and 1 means the summary is lengthy or contains unnecessary information that is difficult to understand or remember. Based on your judgment, assign the appropriate score. |
-
-GPT models evaluate the quality of model predictions based on the given prompt words and gives a score between 1-5.
-
-> **NOTE 1:** Even for the same metric, the details of its prompt words and CoT(Chain-of-Thought) can differ based on which category you want to evaluate. For example, prompt words for metric `correctness` showed here is "Whether the answer is correct or not."(this is for category `classification`), but for category `extraction`, prompt words can be "Answers should extract the required information accurately and should not contain any incorrect or misleading information." You can find all the prompt words and CoT(Chain-of-Thought) in `prompt/evaluation_prompt`.
-
-> **NOTE 2:** To add customized metrics, you can refer to [FAQ](#faq).
-
-#### Automatic Evaluation
-
-Automated metrics evaluate the capability of a model by comparing model predictions with reference answers.
-There are two ways to obtain reference answers:
-
-- For instruction coming from human-designed problems, the reference answers are generated by GPT-3.5, such as roleplay, chat.
-- For instruction related with classic NLP problems, the reference answers are collected from open-sourced dataset with target answers, such as classification, extraction, summarization.
-
-There are 6 types of automatic evaluation metrics listed in the table below:
-
-| Automatic Evaluation Metric | Description |
-| :---------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| BLEU-n | Measure the accuracy between prediction and reference. BLEU-1 (Unigram) evaluates accuracy in word level. BLEU-n (n-gram) evaluate the fluency in sentence level. |
-| ROUGE | ROUGE-N measures the number of matching n-grams between prediction and reference. ROUGE-L measures the number of matching longest common subsequence (LCS) between prediction and reference. |
-| Distinct | Measure the diversity of generation text by counting the unique n-grams. |
-| BERTScore | Measure the semantic similarity between tokens of predictions and references with BERT. |
-| Precision Recall F1 Score | Measure the number of overlaps between prediction and reference (design for classification and extraction categories). |
-| CHRF | Measure the similarity of character n-grams between prediction and reference. |
-
-#### UniEval Evaluation
-
-UniEval converts all evaluation tasks of different dimensions(metrics) into Boolean QA problems and utilize the model to answer with “Yes” or “No”. Compared with similarity-based metrics such as ROUGE and BLEU, UniEval can achieve a more comprehensive evaluation. In addition, UniEval also demonstrates its ability to transfer to unseen dimensions and tasks.
-
-In our evaluation pipeline, two pre-trained UniEval evaluators are used. One is [unieval-sum](https://huggingface.co/MingZhong/unieval-sum) and the other is [unieval-dialog](https://huggingface.co/MingZhong/unieval-dialog). The two models can be used for the 3 tasks, `summarization`, `dialogue` and `data2text`. Each task has different evaluation dimensions.
-
-| UniEval Model | Task | Dimension(Metric) |
-| :------------: | :------------ | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| unieval-sum | summarization | coherence: whether the summary is coherent consistency: whether the claim is consistent with the given document fluency: whether the paragraph is fluent relevance: whether the summary is relevant to the reference |
-| unieval-sum | data2text | naturalness: whether the utterance is fluent informativeness: whether the utterance is informative according to the reference |
-| unieval-dialog | dialogue | naturalness: whether the response is natural in the dialogue coherence: whether the response is coherent in the dialogue history understandability: whether the response is understandable in the dialogue |
-
-> **NOTE 1:** Task "data2text" uses the same model as task "summarization".
-
-> **NOTE 2:** In UniEval paper, the `unieval-sum` model demonstrates the best transfer ability and so you can evaluate your customized metric with this model. Details of adding customized metrics can be found in [FAQ](#faq).
-
-> **NOTE 3:** We consider not including all metrics provided in UniEval in our pipeline because the data structure and content of the instructions we want to evaluate are not suitable for direct use of some UniEval metrics.
-
-## Evaluation Process
-
-### Data Format
-
-#### Target Answers / Predictions
-
-A JSON file contains one list. Each element in the list is a target answer / prediction record for one instruction / question.
-An element should have the following fields:
-
-- `category` (str, compulsory): The category of the instruction / question.
-- `instruction` (str, compulsory): The instruction / question for the LLM.
-- `input` (str, optional): The additional context of the instruction / question.
-- `output` (str, optional): The sample output of the instruction (default: GPT-3.5).
-- `target` (str, optional): The target answer for the instruction.
-- `id` (int, compulsory): The ID of the instruction / question.
-
-If the `input` has a target answer, the `output` can be empty. Otherwise, we generate answers from GPT-3.5 as the `output`, and the `target` field is empty.
-
-Example:
-
-```json
-[
- {
- "category": "brainstorming",
- "instruction": "请介绍一下人工智能的多个领域。",
- "input": "",
- "output": "{GPT-3.5 Answers}",
- "target": "",
- "id": 1
- },
- {
- "category": "classification",
- "instruction": "新闻标题:为什么电影《倩女幽魂》中燕赤霞一个道士却拿着金刚经?请根据新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。",
- "input": "",
- "output": "",
- "target": "{target answer}",
- "id": 2
- }
-]
-```
-
-#### Model Answers / Predictions
-
-A JSON file contains one list. Each element in the list is a model answer / prediction record for one instruction / question.
-
-An element should have the following fields:
-
-- `category` (str, compulsory): The category of the instruction / question.
-- `instruction` (str, compulsory): The instruction / question for the LLM.
-- `input` (str, optional): The additional context of the instruction / question.
-- `output` (str, compulsory): The output from the LLM.
-- `target` (str, optional): The target answer for the instruction.
-- `id` (int, compulsory): The ID of the instruction / question.
-
-Example:
-
-```json
-[
- {
- "category": "brainstorming",
- "instruction": "请介绍一下人工智能的多个领域。",
- "input": "",
- "output": "{Model Answers / Predictions}",
- "target": "",
- "id": 1
- },
- {
- "category": "classification",
- "instruction": "新闻标题:为什么电影《倩女幽魂》中燕赤霞一个道士却拿着金刚经?请根据新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。",
- "input": "",
- "output": "{Model Answers / Predictions}",
- "target": "{target answer}",
- "id": 2
- }
-]
-```
-
-### Prompt
-
-#### Battle Prompt
-
-The following is the Chinese battle prompt. In the battle prompt, the question and answers from two different models are fed into the prompt template. You can find example battle prompt files for Chinese and English in `prompt/battle_prompt`.
-
-```json
-{
- "id": 1,
- "system_prompt": "你是一个检查回答质量的好助手。",
- "prompt_template": "[问题]\n{question}\n\n[1号AI助手的答案]\n{answer_1}\n\n[1号AI助手答案终止]\n\n[2号AI助手的答 案]\n{answer_2}\n\n[2号AI助手答案终止]\n\n[要求]\n{prompt}\n\n",
- "prompt": "我们需要你评价这两个AI助手回答的性能。\n请对他们的回答的有用性、相关性、准确性、详细程度进行评分。每个AI助手都会得到一个1到10分的总分,分数越高表示整体表现越好。\n请首先输出一行,该行只包含两个数值,分别表示1号和2号AI助手的分数。这两个分数之间要有一个空格。在随后的一行中,请对你的评价作出全面的解释,避免任何潜在的偏见,并确保AI助手回答的顺序不会影响您的判断。"
-}
-```
-
-#### Evaluation Prompt
-
-The following is an example of a Chinese GPT evaluation prompt. In an evaluation prompt, you should define your metrics in `metrics` and provide CoT(Chain-of-Thought) in `CoT`. You can find example evaluation prompt files for Chinese and English in `prompt/evaluation_prompt`.
-
-```json
-{
- "brainstorming": {
- "id": 1,
- "category": "brainstorming",
- "metrics": {
- "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。"
- },
- "CoT": {
- "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:"
- },
- "prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
- }
-}
-```
-
-`"metrics"`: the metrics that can be used in GPT evaluation. This field determines which metrics can be added to your config file.
-
-`"CoT"`: evaluation steps you prompt to GPT models for each metric defined in `"metrics"`.
-
-### Evaluation
-
-#### Configuration
-
-The following is an example of a Chinese config file. The configuration file can control how the pipeline evaluates the model. You need to specify GPT evaluation metrics, automatic metrics and UniEval metrics in key `GPT`, `Metrics` and `UniEval`(English only). You can find an example English config file in `config`.
-
-```json
-{
- "language": "en",
- "path_for_UniEval": {
- "summarization": "path to unieval-sum model",
- "dialogue": "path to unieval-dialog model",
- "data2text": "path to unieval-sum model"
- },
- "category": {
- "brainstorming": {
- "GPT": ["relevance", "creativity", "practicality", "reasonableness"],
- "Metrics": ["Distinct"],
- "UniEval": [
- "summarization-fluency",
- "data2text-naturalness",
- "data2text-informativeness"
- ]
- },
- "chat": {
- "GPT": ["relevance", "naturalness", "engagingness", "reasonableness"],
- "Metrics": ["Distinct"],
- "UniEval": [
- "dialogue-naturalness",
- "dialogue-coherence",
- "dialogue-understandability"
- ]
- }
- }
-}
-```
-
-`"language"`: the language used to evaluate the model capability. We only support Chinese `"cn"` for now.
-
-`"path_for_UniEval"`: path to the UniEval model.
-
-`"category"`: the category/categories needed to evaluate the model capability.
-
-`"GPT"`: the metrics you want to use for GPT evaluation.
-
-`"Metrics"`: the metrics you want to use for automatic metrics evaluation.
-
-`"UniEval"`: the metrics you want to use for UniEval metrics evaluation. The metric has to be in the `"{task}-{metric}"` format because different tasks have same metrics such as naturalness and coherence.
-
-You can remove the key such as `"Metrics"` to skip evaluating answers using its corresponding evaluation metrics.
-
-You can create your config file based on available settings listed in following table.
-
-| "category" | "GPT" | "Metrics" | "UniEval" |
-| :--------------: | :---------------------: | :---------: | :--------------------------: |
-| "brainstorming" | "language organization" | "BLEU" | "dialogue-naturalness" |
-| "chat" | "relevance" | "ROUGE" | "dialogue-coherence" |
-| "classification" | "creativity" | "Distinct" | "dialogue-understandability" |
-| "closed_qa" | "practicality" | "BERTScore" | "data2text-naturalness" |
-| "extraction" | "correctness" | "Precision" | "data2text-informativeness" |
-| "generation" | "naturalness" | "Recall" | "summarization-coherence" |
-| "open_qa" | "engagingness" | "F1 score" | "summarization-consistency" |
-| "rewriting" | "reasonableness" | "CHRF" | "summarization-fluency" |
-| "roleplay" | "diversity" | | "summarization-relevance" |
-| "summarization" | "fidelity" | | |
-| | "conciseness" | | |
-
-> **NOTE:** For categories which don't have standard answers such as `brainstorming`, you should avoid using automatic metrics such as `BLEU` and `ROUGE` which are based on similarity measures and you should use `Distinct` instead in your config file.
-
-#### Evaluate
-
-After setting the configuration file, you can evaluate the model using `eval.py`. If you want to make comparisons between answers of two different models, you should specify two answer files in the argument `answer_file_list` and two model names in the argument `model_name_list`. If you want to evaluate one answer file, the length of both `answer_file_list` and `model_name_list` should be 1 and the program will perform evaluation using automatic metrics and GPT models.
-
-An example script is provided as follows:
-
-```shell
-python eval.py \
- --config_file "path to the config file" \
- --battle_prompt_file "path to the prompt file for battle" \
- --gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
- --target_file "path to the target answer file" \
- --answer_file_list "path to the answer files of at most 2 models" \
- --model_name_list "the names of at most 2 models" \
- --gpt_model "which GPT model to use for evaluation" \
- --save_path "path to save results" \
- --openai_key "your openai key" \
-```
-
-If you want GPT evaluation with reference, you can add an argument `--gpt_with_reference`.
-
-## FAQ
-
-How can I add a new GPT evaluation metric?
-
-For example, if you want to add a new metric `persuasiveness` into category `brainstorming`, you should add the metric definition and its corresponding CoT(Chain-of-thought) in the evaluation prompt file in `prompt/evaluation_promt`. The CoT can be generated using ChatGPT. You can prompt ChatGPT to generate evaluation steps for the new metric.
-
-```json
-{
- "brainstorming": {
- "id": 1,
- "category": "brainstorming",
- "metrics": {
- "persuasiveness": "persuasiveness(1-5):a short description for persuasiveness"
- },
- "CoT": {
- "persuasiveness": "CoT for persuasiveness\n\npersuasiveness:"
- },
- "prompt": "You are a good assistant. Please rate the given answer to the \"brainstorming\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
- }
-}
-```
-
-
-
-How can I add a new UniEval evaluation metric?
-
-For example, if you want to add a new metric `persuasiveness` into task `data2text`, you should add a Boolean QA question about the metric in function `add_question` in `unieval/utils.py`. Please do note that how effectively the model would evaluate this metric is unknown, and you may need some experiments to test whether the model is capable of evaluating this metric.
-
-```python
-if task == 'data2text':
- if dimension == 'persuasiveness':
- cur_input = 'question: Is this a persuasive utterence utterance: ' + output[i]
-```
-
-
-
-## To Do
-
-- [x] Add evaluation for English capability
-- [x] Support UniEval
-- [x] Support GPT-4 evaluation
-- [x] Support GPT evaluation with reference
-
-## Citations
-
-```bibtex
-@misc{vicuna2023,
- title = {Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90\%* ChatGPT Quality},
- url = {https://vicuna.lmsys.org},
- author = {Chiang, Wei-Lin and Li, Zhuohan and Lin, Zi and Sheng, Ying and Wu, Zhanghao and Zhang, Hao and Zheng, Lianmin and Zhuang, Siyuan and Zhuang, Yonghao and Gonzalez, Joseph E. and Stoica, Ion and Xing, Eric P.},
- month = {March},
- year = {2023}
-}
-
-@misc{liu2023geval,
- title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
- author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
- year={2023},
- eprint={2303.16634},
- archivePrefix={arXiv},
- primaryClass={cs.CL}
-}
-
-@misc{zhong2022unified,
- title={Towards a Unified Multi-Dimensional Evaluator for Text Generation},
- author={Ming Zhong and Yang Liu and Da Yin and Yuning Mao and Yizhu Jiao and Pengfei Liu and Chenguang Zhu and Heng Ji and Jiawei Han},
- year={2022},
- eprint={2210.07197},
- archivePrefix={arXiv},
- primaryClass={cs.CL}
-}
-```
diff --git a/applications/Chat/evaluate/config/config_cn.json b/applications/Chat/evaluate/config/config_cn.json
deleted file mode 100644
index 023f16bef31c..000000000000
--- a/applications/Chat/evaluate/config/config_cn.json
+++ /dev/null
@@ -1,204 +0,0 @@
-{
- "language": "cn",
- "category": {
- "brainstorming": {
- "GPT": [
- "language organization",
- "relevance",
- "creativity",
- "practicality",
- "reasonableness"
- ],
- "Metrics": [
- "Distinct"
- ]
- },
- "chat": {
- "GPT": [
- "language organization",
- "naturalness",
- "engagingness",
- "fidelity"
- ],
- "Metrics": [
- "Distinct"
- ]
- },
- "classification": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- "Precision",
- "Recall",
- "F1 score",
- "CHRF"
- ]
- },
- "closed_qa": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- "BLEU",
- "ROUGE",
- "BERTScore",
- "CHRF"
- ]
- },
- "extraction": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- "Precision",
- "Recall",
- "F1 score",
- "CHRF"
- ]
- },
- "generation": {
- "GPT": [
- "language organization",
- "relevance",
- "diversity"
- ],
- "Metrics": [
- "BLEU",
- "ROUGE",
- "BERTScore"
- ]
- },
- "logical_reasoning": {
- "GPT": [
- "correctness",
- "relevance",
- "reasonableness"
- ],
- "Metrics": [
- "BLEU",
- "ROUGE",
- "BERTScore",
- "CHRF"
- ]
- },
- "open_qa": {
- "GPT": [
- "language organization",
- "relevance",
- "correctness"
- ],
- "Metrics": [
- "Distinct"
- ]
- },
- "rewriting": {
- "GPT": [
- "language organization",
- "relevance",
- "correctness"
- ],
- "Metrics": [
- "BLEU",
- "ROUGE",
- "BERTScore"
- ]
- },
- "roleplay": {
- "GPT": [
- "language organization",
- "relevance",
- "fidelity",
- "creativity"
- ],
- "Metrics": [
- "Distinct"
- ]
- },
- "summarization": {
- "GPT": [
- "language organization",
- "relevance",
- "correctness",
- "conciseness"
- ],
- "Metrics": [
- ]
- },
- "Finance": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ]
- },
- "Law": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ]
- },
- "Education": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ]
- },
- "Medical": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ]
- },
- "STEM": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ]
- },
- "SocialScience": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ]
- },
- "Humanity": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ]
- },
- "Other": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ]
- },
- "ethics": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ]
- }
- }
-}
diff --git a/applications/Chat/evaluate/config/config_en.json b/applications/Chat/evaluate/config/config_en.json
deleted file mode 100644
index c964122dd6d6..000000000000
--- a/applications/Chat/evaluate/config/config_en.json
+++ /dev/null
@@ -1,283 +0,0 @@
-{
- "language": "en",
- "path_for_UniEval": {
- "summarization": "path to unieval-sum",
- "dialogue": "path to unieval-dialog",
- "data2text": "path to unieval-sum"
- },
- "category": {
- "brainstorming": {
- "GPT": [
- "language organization",
- "relevance",
- "creativity",
- "practicality",
- "reasonableness"
- ],
- "Metrics": [
- "Distinct"
- ],
- "UniEval": [
- "summarization-fluency",
- "data2text-naturalness",
- "data2text-informativeness"
- ]
- },
- "chat": {
- "GPT": [
- "language organization",
- "naturalness",
- "engagingness",
- "fidelity"
- ],
- "Metrics": [
- "Distinct"
- ],
- "UniEval": [
- "summarization-fluency",
- "dialogue-naturalness",
- "dialogue-coherence",
- "dialogue-understandability",
- "data2text-naturalness",
- "data2text-informativeness"
- ]
- },
- "classification": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- "Precision",
- "Recall",
- "F1 score",
- "CHRF"
- ],
- "UniEval": [
- "summarization-fluency",
- "data2text-naturalness",
- "data2text-informativeness"
- ]
- },
- "closed_qa": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- "BLEU",
- "ROUGE",
- "BERTScore",
- "CHRF"
- ],
- "UniEval": [
- "summarization-fluency",
- "data2text-naturalness",
- "data2text-informativeness"
- ]
- },
- "extraction": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- "Precision",
- "Recall",
- "F1 score",
- "CHRF"
- ],
- "UniEval": [
- "summarization-fluency",
- "data2text-naturalness",
- "data2text-informativeness"
- ]
- },
- "generation": {
- "GPT": [
- "language organization",
- "relevance",
- "diversity"
- ],
- "Metrics": [
- "BLEU",
- "ROUGE",
- "BERTScore"
- ],
- "UniEval": [
- "summarization-fluency",
- "data2text-naturalness",
- "data2text-informativeness"
- ]
- },
- "logical_reasoning": {
- "GPT": [
- "correctness",
- "relevance",
- "reasonableness"
- ],
- "Metrics": [
- "BLEU",
- "ROUGE",
- "BERTScore",
- "CHRF"
- ],
- "UniEval": [
- ]
- },
- "open_qa": {
- "GPT": [
- "language organization",
- "relevance",
- "correctness"
- ],
- "Metrics": [
- "Distinct"
- ],
- "UniEval": [
- "summarization-fluency",
- "data2text-naturalness",
- "data2text-informativeness"
- ]
- },
- "rewriting": {
- "GPT": [
- "language organization",
- "relevance",
- "correctness"
- ],
- "Metrics": [
- "BLEU",
- "ROUGE",
- "BERTScore"
- ],
- "UniEval": [
- "summarization-fluency",
- "data2text-naturalness",
- "data2text-informativeness"
- ]
- },
- "roleplay": {
- "GPT": [
- "language organization",
- "relevance",
- "fidelity",
- "creativity"
- ],
- "Metrics": [
- "Distinct"
- ],
- "UniEval": [
- "summarization-fluency",
- "data2text-naturalness",
- "data2text-informativeness"
- ]
- },
- "summarization": {
- "GPT": [
- "language organization",
- "relevance",
- "correctness",
- "conciseness"
- ],
- "Metrics": [
- "BLEU",
- "ROUGE",
- "BERTScore",
- "CHRF"
- ],
- "UniEval": [
- ]
- },
- "Finance": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ],
- "UniEval": [
- ]
- },
- "Law": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ],
- "UniEval": [
- ]
- },
- "Education": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ],
- "UniEval": [
- ]
- },
- "Medical": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ],
- "UniEval": [
- ]
- },
- "STEM": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ],
- "UniEval": [
- ]
- },
- "SocialScience": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ],
- "UniEval": [
- ]
- },
- "Humanity": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ],
- "UniEval": [
- ]
- },
- "Other": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ],
- "UniEval": [
- ]
- },
- "ethics": {
- "GPT": [
- "relevance",
- "correctness"
- ],
- "Metrics": [
- ],
- "UniEval": [
- ]
- }
- }
-}
diff --git a/applications/Chat/evaluate/eval.py b/applications/Chat/evaluate/eval.py
deleted file mode 100644
index e3fe0e9e091b..000000000000
--- a/applications/Chat/evaluate/eval.py
+++ /dev/null
@@ -1,112 +0,0 @@
-import argparse
-import json
-import os
-
-import openai
-from evaluator import Evaluator
-from utils import jload
-
-
-def main(args):
- assert len(args.answer_file_list) == len(
- args.model_name_list), "The number of answer files and model names should be equal!"
-
- # load config
- config = jload(args.config_file)
-
- if config["language"] in ["cn", "en"]:
- # get metric settings for all categories
- metrics_per_category = {}
- for category in config["category"].keys():
- metrics_all = {}
- for metric_type, metrics in config["category"][category].items():
- metrics_all[metric_type] = metrics
- metrics_per_category[category] = metrics_all
-
- battle_prompt = None
- if args.battle_prompt_file:
- battle_prompt = jload(args.battle_prompt_file)
-
- gpt_evaluation_prompt = None
- if args.gpt_evaluation_prompt_file:
- gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)
-
- if len(args.model_name_list) == 2 and not battle_prompt:
- raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")
-
- if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
- raise Exception(
- "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
-
- if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
- raise Exception(
- "GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."
- )
-
- # initialize evaluator
- evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
- config["language"], config.get("path_for_UniEval", None), args.gpt_with_reference)
- if len(args.model_name_list) == 2:
- answers1 = jload(args.answer_file_list[0])
- answers2 = jload(args.answer_file_list[1])
-
- assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"
-
- evaluator.battle(answers1=answers1, answers2=answers2)
- evaluator.save(args.save_path, args.model_name_list)
- elif len(args.model_name_list) == 1:
- targets = jload(args.target_file)
- answers = jload(args.answer_file_list[0])
-
- assert len(targets) == len(answers), "The number of target answers and model answers should be equal!"
-
- evaluator.evaluate(answers=answers, targets=targets)
- evaluator.save(args.save_path, args.model_name_list)
- else:
- raise ValueError("Unsupported number of answer files and model names!")
- else:
- raise ValueError(f'Unsupported language {config["language"]}!')
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
- parser.add_argument('--config_file',
- type=str,
- default=None,
- required=True,
- help='path to the file of target results')
- parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
- parser.add_argument('--gpt_evaluation_prompt_file',
- type=str,
- default=None,
- help='path to the prompt file for gpt evaluation')
- parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
- parser.add_argument('--answer_file_list',
- type=str,
- nargs='+',
- default=[],
- required=True,
- help='path to the answer files of at most 2 models')
- parser.add_argument('--model_name_list',
- type=str,
- nargs='+',
- default=[],
- required=True,
- help='the names of at most 2 models')
- parser.add_argument('--gpt_model',
- default="gpt-3.5-turbo",
- choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
- help='which GPT model to use for evaluation')
- parser.add_argument('--gpt_with_reference',
- default=False,
- action="store_true",
- help='whether to include reference answer in gpt evaluation')
- parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
- parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
- args = parser.parse_args()
-
- if args.openai_key is not None:
- os.environ["OPENAI_API_KEY"] = args.openai_key
- openai.api_key = os.getenv("OPENAI_API_KEY")
-
- main(args)
diff --git a/applications/Chat/evaluate/evaluator.py b/applications/Chat/evaluate/evaluator.py
deleted file mode 100644
index 3dd5fd6f2f23..000000000000
--- a/applications/Chat/evaluate/evaluator.py
+++ /dev/null
@@ -1,219 +0,0 @@
-import os
-from typing import Any, Dict, List
-
-import gpt_evaluate
-import metrics
-import pandas as pd
-import unieval
-from utils import analyze_automatic_results, get_data_per_category, save_automatic_results
-
-
-class Evaluator(object):
- """
- A class named Evaluator includes GPT-3.5/GPT-4 evaluation
- and automatic evaluation
-
- """
-
- def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any],
- gpt_model: str, language: str, path_for_UniEval: Dict[str, str], gpt_with_reference: bool) -> None:
- self.params = params
- self.battle_prompt = battle_prompt
- self.gpt_evaluation_prompt = gpt_evaluation_prompt
- self.gpt_model = gpt_model
- self.language = language
- self.path_for_UniEval = path_for_UniEval
- self.gpt_with_reference = gpt_with_reference
- self.automatic_metric_stats = dict()
- self.unieval_metric_stats = dict()
- self.gpt_evaluation_results = dict()
- self.battle_results = []
-
- def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None:
- """
- Comparison between two models using GPT-4 as the reviewer.
- """
-
- self.battle_results = gpt_evaluate.battle(answers1, answers2, self.battle_prompt)
-
- def evaluate(self, answers: List[Dict], targets: List[Dict]) -> None:
- """
- A comprehensive evaluation of the answers from the model.
- The function evaluates the model's performance from different perspectives
- using GPT-3.5, GPT-4, and off-the-shelf evaluation metrics.
-
- The metrics will be decided by the config file.
-
- """
-
- def switch(metric, language):
- if metric == "BLEU":
- return metrics.bleu_score(preds=predicts_list, targets=targets_list, language=language)
- elif metric == "ROUGE":
- return metrics.rouge_score(preds=predicts_list, targets=targets_list, language=language)
- elif metric == "Distinct":
- return metrics.distinct_score(preds=predicts_list, language=language)
- elif metric == "BERTScore":
- return metrics.bert_score(preds=predicts_list, targets=targets_list, language=language)
- elif metric == "Precision":
- return metrics.precision(preds=predicts_list, targets=targets_list, language=language)
- elif metric == "Recall":
- return metrics.recall(preds=predicts_list, targets=targets_list, language=language)
- elif metric == "F1 score":
- return metrics.F1_score(preds=predicts_list, targets=targets_list, language=language)
- elif metric == "CHRF":
- return metrics.chrf_score(preds=predicts_list, targets=targets_list, language=language)
- else:
- raise ValueError(f"Unexpected metric")
-
- answers_per_category = get_data_per_category(answers, list(self.params.keys()))
- targets_per_category = get_data_per_category(targets, list(self.params.keys()))
-
- # automatic evaluation
- for category in self.params:
- if len(answers_per_category[category]) == 0:
- print(f"Category {category} specified in your config doesn't have corresponding answers!")
- continue
-
- if self.params[category].get("Metrics", None) is None:
- continue
-
- category_metrics = self.params[category]["Metrics"]
- self.automatic_metric_stats[category] = {}
-
- targets_list = [
- target["target"] if target["target"] else target["output"] for target in targets_per_category[category]
- ]
- predicts_list = [answer["output"] for answer in answers_per_category[category]]
-
- for metric in category_metrics:
- self.automatic_metric_stats[category].update(switch(metric=metric, language=self.language))
-
- # UniEval evaluation
- # self.unieval_metric_stats's key is "task" instead of "category".
- # Iterating "task" first will avoid repeated loading models because one task corresponds to one UniEval model.
- # If key is "category", different models will be loaded for multiple times across categories because the user may require different task(models) to evaluate one category.
- for category in self.params:
- if len(answers_per_category[category]) == 0:
- print(f"Category {category} specified in your config doesn't have corresponding answers!")
- continue
-
- if self.params[category].get("UniEval", None) is None:
- continue
-
- if self.params[category]["UniEval"] and self.language == "cn":
- raise Exception(
- "UniEval doesn't support Chinese! Please remove UniEval config in your Chinese config file.")
-
- category_metrics = self.params[category]["UniEval"]
-
- for task, metric in [tuple(category_metric.split("-")) for category_metric in category_metrics]:
- if self.unieval_metric_stats.get(task, None) is None:
- self.unieval_metric_stats[task] = {category: {metric: 0}}
- elif self.unieval_metric_stats[task].get(category, None) is None:
- self.unieval_metric_stats[task][category] = {metric: 0}
- else:
- self.unieval_metric_stats[task][category][metric] = 0
-
- for task in self.unieval_metric_stats:
- if self.path_for_UniEval is None:
- raise Exception(f"Please specify the path for UniEval model in the config file!")
-
- if self.path_for_UniEval.get(task, None) is None:
- raise Exception(f"Please specify the model path for task {task} in the config file!")
-
- print(f"Load UniEval model for task {task}.")
-
- uni_evaluator = unieval.get_evaluator(task, model_name_or_path=self.path_for_UniEval[task])
- for category in self.unieval_metric_stats[task]:
- targets_list = [
- target["target"] if target["target"] else target["output"]
- for target in targets_per_category[category]
- ]
- predicts_list = [answer["output"] for answer in answers_per_category[category]]
- sources_list = [answer["instruction"] + answer["input"] for answer in answers_per_category[category]]
-
- data = unieval.convert_data_to_unieval_format(predicts_list, sources_list, targets_list)
- scores = uni_evaluator.evaluate(data,
- category,
- dims=list(self.unieval_metric_stats[task][category].keys()),
- overall=False)
- avg_scores = unieval.calculate_average_score(scores)
-
- self.unieval_metric_stats[task][category].update(avg_scores)
-
- # gpt evaluation
- for category in self.params:
- if len(answers_per_category[category]) == 0:
- print(f"Category {category} specified in your config doesn't have corresponding answers!")
- continue
-
- if self.params[category].get("GPT", None) is None:
- continue
-
- category_metrics = self.params[category]["GPT"]
-
- prompt = self.gpt_evaluation_prompt.get(category, None)
- if prompt is None:
- print(f"No prompt for category {category}! Use prompt for category general now.")
- prompt = self.gpt_evaluation_prompt["general"]
-
- self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(
- answers_per_category[category],
- prompt,
- category_metrics,
- category,
- self.gpt_model,
- self.language,
- references=targets_per_category[category] if self.gpt_with_reference else None)
-
- def save(self, path: str, model_name_list: List[str]) -> None:
- """
- Save evaluation results of GPT-3.5, GPT-4, and off-the-shelf evaluation metrics.
-
- """
-
- if len(model_name_list) == 2:
- save_path = os.path.join(path, "gpt_evaluate", "battle_results")
- gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
- else:
- if self.automatic_metric_stats:
- # Save evaluation results for automatic metrics
- automatic_base_save_path = os.path.join(path, "automatic_results")
- automatic_results_save_path = os.path.join(automatic_base_save_path, "evaluation_results")
-
- save_automatic_results(model_name_list[0], self.automatic_metric_stats, automatic_results_save_path)
-
- # Save charts and csv.
- automatic_analyses_save_path = os.path.join(automatic_base_save_path, "evaluation_analyses")
- analyze_automatic_results(automatic_results_save_path, automatic_analyses_save_path)
-
- if self.unieval_metric_stats:
- # Save evaluation results for UniEval metrics
- unieval_base_save_path = os.path.join(path, "unieval_results")
- unieval_results_save_path = os.path.join(unieval_base_save_path, "evaluation_results")
-
- unieval.save_unieval_results(model_name_list[0], self.unieval_metric_stats, unieval_results_save_path)
-
- # Save charts and csv.
- unieval_analyses_save_path = os.path.join(unieval_base_save_path, "evaluation_analyses")
- unieval.analyze_unieval_results(unieval_results_save_path, unieval_analyses_save_path)
-
- if self.gpt_evaluation_results:
- # Save evaluation results for GPT evaluation metrics.
- gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
- gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")
-
- all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0],
- self.gpt_evaluation_results,
- gpt_evaluation_results_save_path)
-
- # Start to calculate scores and save statistics.
- gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics")
- gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
- gpt_evaluation_statistics_save_path)
-
- # Save charts and csv.
- gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses")
- gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path,
- gpt_evaluation_analyses_save_path)
diff --git a/applications/Chat/evaluate/metrics.py b/applications/Chat/evaluate/metrics.py
deleted file mode 100644
index 77f9b6e98044..000000000000
--- a/applications/Chat/evaluate/metrics.py
+++ /dev/null
@@ -1,253 +0,0 @@
-import statistics
-from typing import Dict, List
-
-import jieba
-from bert_score import score
-from nltk.translate.bleu_score import sentence_bleu
-from nltk.translate.chrf_score import sentence_chrf
-from rouge_chinese import Rouge as Rouge_cn
-from rouge_score import rouge_scorer as Rouge_en
-from sklearn.metrics import f1_score, precision_score, recall_score
-from utils import preprocessing_text, remove_redundant_space
-
-
-def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
- """Calculate BLEU Score Metric
-
- The calculation includes BLEU-1 for unigram, BLEU-2 for bigram,
- BLEU-3 for trigram and BLEU-4 for 4-gram. Unigram evaluates the
- accuracy in word level, other n-gram evaluate the fluency in
- sentence level.
- """
- bleu_scores = {"bleu1": 0, "bleu2": 0, "bleu3": 0, "bleu4": 0}
- cumulative_bleu = [0] * 4
- weights = [(1. / 1., 0., 0., 0.), (1. / 2., 1. / 2., 0., 0.), (1. / 3., 1. / 3., 1. / 3., 0.),
- (1. / 4., 1. / 4., 1. / 4., 1. / 4.)]
-
- for pred, target in zip(preds, targets):
- if language == "cn":
- pred_list = ' '.join(jieba.cut(preprocessing_text(pred))).split()
- target_list = [(' '.join(jieba.cut(preprocessing_text(target)))).split()]
- elif language == "en":
- pred_list = preprocessing_text(pred).split()
- target_list = [preprocessing_text(target).split()]
-
- bleu = sentence_bleu(target_list, pred_list, weights=weights)
- cumulative_bleu = [a + b for a, b in zip(cumulative_bleu, bleu)]
-
- for i in range(len(cumulative_bleu)):
- bleu_scores[f"bleu{i+1}"] = cumulative_bleu[i] / len(preds)
-
- return bleu_scores
-
-
-def chrf_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
- """Calculate CHRF Score Metric in sentence level.
- """
- chrf_score = {"chrf": 0}
- cumulative_chrf = []
-
- for pred, target in zip(preds, targets):
- if language == "cn":
- pred_list = ' '.join(jieba.cut(preprocessing_text(pred))).split()
- target_list = ' '.join(jieba.cut(preprocessing_text(target))).split()
- elif language == "en":
- pred_list = preprocessing_text(pred).split()
- target_list = preprocessing_text(target).split()
-
- cumulative_chrf.append(sentence_chrf(target_list, pred_list))
-
- chrf_score["chrf"] = statistics.mean(cumulative_chrf)
-
- return chrf_score
-
-
-def rouge_cn_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
- """Calculate Chinese ROUGE Score Metric
-
- The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
- and ROUGE-L. ROUGE-N evaluates the number of matching n-grams between
- the preds and targets. ROUGE-L measures the number of matching
- longest common subsequence (LCS) between preds and targets.
- """
- rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
- all_preds = []
- all_targets = []
-
- for pred, target in zip(preds, targets):
- pred_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(pred))))
- target_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(target))))
- all_preds.append(pred_list)
- all_targets.append(target_list)
-
- rouge_cn = Rouge_cn()
- rouge_avg = rouge_cn.get_scores(all_preds, all_targets, avg=True)
-
- rouge_scores["rouge1"] = rouge_avg["rouge-1"]["f"]
- rouge_scores["rouge2"] = rouge_avg["rouge-2"]["f"]
- rouge_scores["rougeL"] = rouge_avg["rouge-l"]["f"]
-
- return rouge_scores
-
-
-def rouge_en_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
- """Calculate English ROUGE Score Metric
-
- The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
- and ROUGE-L. ROUGE-N evaluates the number of matching n-grams between
- the preds and targets. ROUGE-L measures the number of matching
- longest common subsequence (LCS) between preds and targets.
- """
- rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
- all_preds = []
- all_targets = []
-
- rouge_en = Rouge_en.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
-
- for pred, target in zip(preds, targets):
- score = rouge_en.score(preprocessing_text(pred), preprocessing_text(target))
- rouge_scores["rouge1"] += score['rouge1'].fmeasure
- rouge_scores["rouge2"] += score['rouge2'].fmeasure
- rouge_scores["rougeL"] += score['rougeL'].fmeasure
-
- rouge_scores["rouge1"] = rouge_scores["rouge1"] / len(preds)
- rouge_scores["rouge2"] = rouge_scores["rouge2"] / len(preds)
- rouge_scores["rougeL"] = rouge_scores["rougeL"] / len(preds)
-
- return rouge_scores
-
-
-def rouge_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
- """Calculate ROUGE Score Metric"""
- if language == "cn":
- return rouge_cn_score(preds, targets)
- elif language == "en":
- return rouge_en_score(preds, targets)
-
-
-def distinct_score(preds: List[str], language: str) -> Dict[str, float]:
- """Calculate Distinct Score Metric
-
- This metric refers to https://arxiv.org/abs/1510.03055.
- It evaluates the diversity of generation text by counting
- the unique n-grams.
- """
- distinct_score = {"distinct": 0}
- cumulative_distinct = []
-
- for pred in preds:
- if language == "cn":
- pred_seg_list = ' '.join(jieba.cut(pred)).split()
- count_segs = len(pred_seg_list)
- unique_segs = set(pred_seg_list)
- count_unique_chars = len(unique_segs)
- # prevent denominator from being 0
- cumulative_distinct.append(count_unique_chars / (count_segs + 1e-6))
- elif language == "en":
- # calculate distinct 1-gram, 2-gram, 3-gram
- unique_ngram = [set() for _ in range(0, 3)]
- all_ngram_count = [0 for _ in range(0, 3)]
-
- split_pred = preprocessing_text(pred).split()
- for n in range(0, 3):
- for i in range(0, len(split_pred) - n):
- ngram = ' '.join(split_pred[i:i + n + 1])
- unique_ngram[n].add(ngram)
- all_ngram_count[n] += 1
-
- # Sometimes the answer may contain only one word. For 2-gram and 3-gram, the gram count(denominator) may be zero.
- avg_distinct = [len(a) / (b + 1e-6) for a, b in zip(unique_ngram, all_ngram_count)]
-
- cumulative_distinct.append(statistics.mean(avg_distinct))
-
- distinct_score["distinct"] = statistics.mean(cumulative_distinct)
-
- return distinct_score
-
-
-def bert_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
- """Calculate BERTScore Metric
-
- The BERTScore evaluates the semantic similarity between
- tokens of preds and targets with BERT.
- """
- bert_score = {"bert_score": 0}
- pred_list = []
- target_list = []
-
- for pred, target in zip(preds, targets):
- pred_list.append(pred)
- target_list.append(target)
-
- if language == "cn":
- _, _, F = score(pred_list, target_list, lang="zh", verbose=True)
- elif language == "en":
- _, _, F = score(pred_list, target_list, lang="en", verbose=True)
-
- bert_score["bert_score"] = F.mean().item()
-
- return bert_score
-
-
-def calculate_precision_recall_f1(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
- """Precision, Recall and F1-Score Calculation
-
- The calculation of precision, recall and f1-score is realized by counting
- the number f overlaps between the preds and target. The comparison length
- limited by the shorter one of preds and targets.
- """
- precision_recall_f1 = {"precision": 0, "recall": 0, "f1_score": 0}
- precision_scores = []
- recall_scores = []
- f1_scores = []
-
- for pred, target in zip(preds, targets):
- if language == "cn":
- pred_list = [char for char in ' '.join(jieba.cut(preprocessing_text(pred))).split()]
- target_list = [char for char in ' '.join(jieba.cut(preprocessing_text(target))).split()]
- elif language == "en":
- pred_list = [char for char in preprocessing_text(pred).split()]
- target_list = [char for char in preprocessing_text(target).split()]
-
- target_labels = [1] * min(len(target_list), len(pred_list))
- pred_labels = [int(pred_list[i] == target_list[i]) for i in range(0, min(len(target_list), len(pred_list)))]
-
- precision_scores.append(precision_score(target_labels, pred_labels, zero_division=0))
- recall_scores.append(recall_score(target_labels, pred_labels, zero_division=0))
- f1_scores.append(f1_score(target_labels, pred_labels, zero_division=0))
-
- precision_recall_f1["precision"] = statistics.mean(precision_scores)
- precision_recall_f1["recall"] = statistics.mean(recall_scores)
- precision_recall_f1["f1_score"] = statistics.mean(f1_scores)
-
- return precision_recall_f1
-
-
-def precision(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
- """Calculate Precision Metric
-
- Calculating precision by counting the number of overlaps between the preds and target.
- """
- precision = {"precision": 0}
- precision["precision"] = calculate_precision_recall_f1(preds, targets, language)["precision"]
- return precision
-
-
-def recall(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
- """Calculate Recall Metric
-
- Calculating recall by counting the number of overlaps between the preds and target.
- """
- recall = {"recall": 0}
- recall["recall"] = calculate_precision_recall_f1(preds, targets, language)["recall"]
- return recall
-
-
-def F1_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
- """Calculate F1-score Metric
-
- Calculating f1-score by counting the number of overlaps between the preds and target.
- """
- f1 = {"f1_score": 0}
- f1["f1_score"] = calculate_precision_recall_f1(preds, targets, language)["f1_score"]
- return f1
diff --git a/applications/Chat/evaluate/requirements.txt b/applications/Chat/evaluate/requirements.txt
deleted file mode 100644
index 27d317ed88cc..000000000000
--- a/applications/Chat/evaluate/requirements.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-jieba
-bert-score
-rouge_chinese
-scikit-metrics
-nltk
-openai
-seaborn
-pandas
-matplotlib
-numpy
-zhon
-rouge_score
diff --git a/applications/Chat/evaluate/unieval/__init__.py b/applications/Chat/evaluate/unieval/__init__.py
deleted file mode 100644
index dad8d6ad09fa..000000000000
--- a/applications/Chat/evaluate/unieval/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from .evaluator import get_evaluator
-from .utils import (
- analyze_unieval_results,
- calculate_average_score,
- convert_data_to_unieval_format,
- save_unieval_results,
-)
-
-__all__ = [
- 'get_evaluator', 'convert_data_to_unieval_format', 'calculate_average_score', 'save_unieval_results',
- 'analyze_unieval_results'
-]
diff --git a/applications/Chat/evaluate/unieval/evaluator.py b/applications/Chat/evaluate/unieval/evaluator.py
deleted file mode 100644
index 56cc6d2f9e41..000000000000
--- a/applications/Chat/evaluate/unieval/evaluator.py
+++ /dev/null
@@ -1,331 +0,0 @@
-# MIT License
-
-# Copyright (c) 2022 Ming Zhong
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-import numpy as np
-from nltk import sent_tokenize
-
-from .scorer import UniEvaluator
-from .utils import add_question
-
-
-class SumEvaluator:
-
- def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
- """ Set up evaluator for text summarization """
- self.scorer = UniEvaluator(
- model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
- max_length=max_length,
- device=device,
- cache_dir=cache_dir)
- self.task = 'summarization'
- self.dimensions = ['coherence', 'consistency', 'fluency', 'relevance']
-
- def evaluate(self, data, category, dims=None, overall=True):
- """
- Get the scores of all the given dimensions
-
- category: The category to be evaluated.
-
- dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
- four dimensions: coherence, consistency, fluency, relevance.
-
- overall: indicates whether the overall score is to be calculated.
- Overall score can be customized to a combination of scores based on different
- dimensions. The default here is the average score of all the given dimensions.
- """
- n_data = len(data)
- eval_scores = [{} for _ in range(n_data)]
-
- if dims == None:
- eval_dims = self.dimensions
- else:
- assert isinstance(dims, list)
- eval_dims = dims
-
- for dim in eval_dims:
- # Calculate average sentence-level scores for 'consistency' and 'fluency'
- if dim == 'consistency' or dim == 'fluency':
- src_list, output_list = [], []
- n_sents = [] # the number of sentences in each generated summary
- for i in range(n_data):
- source = data[i]['source']
- system_outputs = sent_tokenize(data[i]['system_output'])
- n_sents.append(len(system_outputs))
- for j in range(len(system_outputs)):
- src_list.append(source)
- output_list.append(system_outputs[j])
- input_list = add_question(dimension=dim, output=output_list, src=src_list, task=self.task)
- sent_score = self.scorer.score(input_list, self.task, category, dim)
-
- # Get average score for each sample
- start_idx = 0
- score = []
- for cur_n_sent in n_sents:
- # prevent denominator from being 0
- score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
- start_idx += cur_n_sent
-
- # Calculate summary-level score for 'coherence' and 'relevance'
- elif dim == 'coherence' or dim == 'relevance':
- src_list, output_list, ref_list = [], [], []
- for i in range(n_data):
- src_list.append(data[i]['source'])
- output_list.append(data[i]['system_output'])
- if dim == 'relevance':
- ref_list.append(data[i]['reference'])
- input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task)
- score = self.scorer.score(input_list, self.task, category, dim)
-
- # Please customize other dimensions here for summarization
- else:
- raise NotImplementedError('The input format for this dimension is still undefined. \
- Please customize it first.')
-
- for i in range(n_data):
- eval_scores[i][dim] = score[i]
-
- # Customize your overall score here.
- if overall == True:
- for i in range(n_data):
- eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
-
- return eval_scores
-
-
-class DialogEvaluator:
-
- def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
- """ Set up evaluator for dialogues """
- self.scorer = UniEvaluator(
- model_name_or_path='MingZhong/unieval-dialog' if model_name_or_path == "" else model_name_or_path,
- max_length=max_length,
- device=device,
- cache_dir=cache_dir)
- self.task = 'dialogue'
- self.dimensions = ['naturalness', 'coherence', 'engagingness', 'groundedness', 'understandability']
-
- def evaluate(self, data, category, dims=None, overall=True):
- """
- Get the scores of all the given dimensions
-
- category: The category to be evaluated.
-
- dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
- five dimensions: naturalness, coherence, engagingness, groundedness and understandability.
-
- overall: indicates whether the overall score is to be calculated.
- Overall score can be customized to a combination of scores based on different
- dimensions. The default here is the average score of all the given dimensions.
- """
- n_data = len(data)
- eval_scores = [{} for _ in range(n_data)]
-
- if dims == None:
- eval_dims = self.dimensions
- else:
- assert isinstance(dims, list)
- eval_dims = dims
-
- for dim in eval_dims:
- # Calculate summation score for 'engagingness'
- if dim == 'engagingness':
- src_list, output_list, context_list = [], [], []
- n_sents = [] # the number of sentences in each generated response
- for i in range(n_data):
- source = data[i]['source']
- context = data[i]['context']
- system_outputs = sent_tokenize(data[i]['system_output'])
- n_sents.append(len(system_outputs))
- for j in range(len(system_outputs)):
- src_list.append(source)
- context_list.append(context)
- output_list.append(system_outputs[j])
- input_list = add_question(dimension=dim,
- output=output_list,
- src=src_list,
- context=context_list,
- task=self.task)
- sent_score = self.scorer.score(input_list, self.task, category, dim)
-
- # Get the summation score for each sample
- start_idx = 0
- score = []
- for cur_n_sent in n_sents:
- score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]))
- start_idx += cur_n_sent
-
- # Calculate turn-level score for other dimensions
- elif dim in ['naturalness', 'coherence', 'groundedness', 'understandability']:
- src_list, output_list, context_list = [], [], []
- for i in range(n_data):
- src_list.append(data[i]['source'])
- output_list.append(data[i]['system_output'])
- context_list.append(data[i]['context'])
- input_list = add_question(dimension=dim,
- output=output_list,
- src=src_list,
- context=context_list,
- task=self.task)
- score = self.scorer.score(input_list, self.task, category, dim)
-
- # Please customize other dimensions here for summarization
- else:
- raise NotImplementedError('The input format for this dimension is still undefined. \
- Please customize it first.')
-
- for i in range(n_data):
- eval_scores[i][dim] = score[i]
-
- # Customize your overall score here.
- if overall == True:
- for i in range(n_data):
- eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
-
- return eval_scores
-
-
-class D2tEvaluator:
-
- def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
- """ Set up evaluator for data-to-text """
- self.scorer = UniEvaluator(
- model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
- max_length=max_length,
- device=device,
- cache_dir=cache_dir)
- self.task = 'data2text'
- self.dimensions = ['naturalness', 'informativeness']
-
- def evaluate(self, data, category, dims=None, overall=True):
- """
- Get the scores of all the given dimensions
-
- category: The category to be evaluated.
-
- dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
- two dimensions: naturalness and informativeness.
-
- overall: indicates whether the overall score is to be calculated.
- Overall score can be customized to a combination of scores based on different
- dimensions. The default here is the average score of all the given dimensions.
- """
- n_data = len(data)
- eval_scores = [{} for _ in range(n_data)]
-
- if dims == None:
- eval_dims = self.dimensions
- else:
- assert isinstance(dims, list)
- eval_dims = dims
-
- for dim in eval_dims:
- output_list, ref_list = [], []
- for i in range(n_data):
- output_list.append(data[i]['system_output'])
- ref_list.append(data[i]['reference'])
-
- input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task)
- score = self.scorer.score(input_list, self.task, category, dim)
-
- for i in range(n_data):
- eval_scores[i][dim] = score[i]
-
- # Customize your overall score here.
- if overall == True:
- for i in range(n_data):
- eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))
-
- return eval_scores
-
-
-class FactEvaluator:
-
- def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
- """ Set up evaluator for factual consistency detection """
- self.scorer = UniEvaluator(
- model_name_or_path='MingZhong/unieval-fact' if model_name_or_path == "" else model_name_or_path,
- max_length=max_length,
- device=device,
- cache_dir=cache_dir)
- self.task = 'fact'
- self.dim = 'consistency'
-
- def evaluate(self, data, category):
- """
- Get the factual consistency score (only 1 dimension for this task)
-
- category: The category to be evaluated.
- """
- n_data = len(data)
- eval_scores = [{} for _ in range(n_data)]
-
- # Calculate average sentence-level scores for factual consistency
- src_list, output_list = [], []
- n_sents = [] # the number of sentences in the claim
- for i in range(n_data):
- source = data[i]['source']
- system_outputs = sent_tokenize(data[i]['system_output'])
- n_sents.append(len(system_outputs))
- for j in range(len(system_outputs)):
- src_list.append(source)
- output_list.append(system_outputs[j])
- input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
- sent_score = self.scorer.score(input_list, self.task, category, self.dim)
-
- # Get average score for each sample
- start_idx = 0
- score = []
- for cur_n_sent in n_sents:
- score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent)
- start_idx += cur_n_sent
-
- for i in range(n_data):
- eval_scores[i][self.dim] = score[i]
-
- return eval_scores
-
-
-def get_evaluator(task, model_name_or_path="", max_length=1024, device='cuda:0', cache_dir=None):
- assert task in ['summarization', 'dialogue', 'data2text', 'fact']
- if task == 'summarization':
- return SumEvaluator(model_name_or_path=model_name_or_path,
- max_length=max_length,
- device=device,
- cache_dir=cache_dir)
- elif task == 'dialogue':
- return DialogEvaluator(model_name_or_path=model_name_or_path,
- max_length=max_length,
- device=device,
- cache_dir=cache_dir)
- elif task == 'data2text':
- return D2tEvaluator(model_name_or_path=model_name_or_path,
- max_length=max_length,
- device=device,
- cache_dir=cache_dir)
- elif task == 'fact':
- return FactEvaluator(model_name_or_path=model_name_or_path,
- max_length=max_length,
- device=device,
- cache_dir=cache_dir)
- else:
- raise NotImplementedError('Other tasks are not implemented, \
- please customize specific tasks here.')
diff --git a/applications/Chat/evaluate/unieval/scorer.py b/applications/Chat/evaluate/unieval/scorer.py
deleted file mode 100644
index 2c70bb9f6ded..000000000000
--- a/applications/Chat/evaluate/unieval/scorer.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# MIT License
-
-# Copyright (c) 2022 Ming Zhong
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-import torch
-import torch.nn as nn
-from tqdm import tqdm
-from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer
-
-
-class UniEvaluator:
-
- def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
- """ Set up model """
- self.device = device
- self.max_length = max_length
-
- self.config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)
- self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
- self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, config=self.config, cache_dir=cache_dir)
-
- self.model.eval()
- self.model.to(device)
-
- self.softmax = nn.Softmax(dim=1)
-
- self.pos_id = self.tokenizer("Yes")["input_ids"][0]
- self.neg_id = self.tokenizer("No")["input_ids"][0]
-
- def score(self, inputs, task, category, dim, batch_size=8):
- """
- Get scores for the given samples.
- final_score = postive_score / (postive_score + negative_score)
- """
-
- # The implementation of "forward" in T5 still requires decoder_input_ids.
- # Therefore, we construct a random one-word target sequence.
- # The content of the target has no effect on the final scores.
- tgts = ["No" for _ in range(len(inputs))]
-
- pos_score_list, neg_score_list = [], []
- for i in tqdm(range(0, len(inputs), batch_size), desc=f"{category}-({dim}-{task}): "):
- src_list = inputs[i:i + batch_size]
- tgt_list = tgts[i:i + batch_size]
- try:
- with torch.no_grad():
- encoded_src = self.tokenizer(src_list,
- max_length=self.max_length,
- truncation=True,
- padding=True,
- return_tensors='pt')
- encoded_tgt = self.tokenizer(tgt_list,
- max_length=self.max_length,
- truncation=True,
- padding=True,
- return_tensors='pt')
-
- src_tokens = encoded_src['input_ids'].to(self.device)
- src_mask = encoded_src['attention_mask'].to(self.device)
-
- tgt_tokens = encoded_tgt['input_ids'].to(self.device)[:, 0].unsqueeze(-1)
-
- output = self.model(input_ids=src_tokens, attention_mask=src_mask, labels=tgt_tokens)
- logits = output.logits.view(-1, self.model.config.vocab_size)
-
- pos_score = self.softmax(logits)[:, self.pos_id] # Yes
- neg_score = self.softmax(logits)[:, self.neg_id] # No
-
- cur_pos_score = [x.item() for x in pos_score]
- cur_neg_score = [x.item() for x in neg_score]
- pos_score_list += cur_pos_score
- neg_score_list += cur_neg_score
-
- except RuntimeError:
- print(f'source: {src_list}')
- print(f'target: {tgt_list}')
- exit(0)
-
- score_list = []
- for i in range(len(pos_score_list)):
- score_list.append(pos_score_list[i] / (pos_score_list[i] + neg_score_list[i]))
-
- return score_list
diff --git a/applications/Chat/evaluate/unieval/utils.py b/applications/Chat/evaluate/unieval/utils.py
deleted file mode 100644
index a381e9e590b2..000000000000
--- a/applications/Chat/evaluate/unieval/utils.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# MIT License
-
-# Copyright (c) 2022 Ming Zhong
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-import os
-from typing import Dict
-
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-import tqdm
-
-
-def add_question(dimension, output, src=None, ref=None, context=None, task=None):
- """
- Add questions to generate input in Bool-QA format for UniEval.
-
- dimension: specific dimension to be evaluated
- src: source input for different NLG tasks. For example, source document for summarization
- and dialogue history for dialogue response generation.
- output: output text generated by the models
- ref: human-annotated groundtruth
- context: the context needed to evaluate several specific dimension. For example,
- additional factual information when evaluating engagingness and groundedness in dialogues.
- """
-
- input_with_question = []
- for i in range(len(output)):
- # For summarization
- if task == 'summarization':
- if dimension == 'fluency':
- cur_input = 'question: Is this a fluent paragraph? paragraph: ' + output[i]
- elif dimension == 'coherence':
- cur_input = 'question: Is this a coherent summary to the document? summary: ' + output[
- i] + ' document: ' + src[i]
- elif dimension == 'consistency':
- cur_input = 'question: Is this claim consistent with the document? claim: ' + output[
- i] + ' document: ' + src[i]
- elif dimension == 'relevance':
- cur_input = 'question: Is this summary relevant to the reference? summary: ' + output[
- i] + ' reference: ' + ref[i]
- else:
- raise NotImplementedError(
- 'The input format for this dimension is still undefined. Please customize it first.')
- # For dialogues
- elif task == 'dialogue':
- if dimension == 'naturalness':
- cur_input = 'question: Is this a natural response in the dialogue? response: ' + output[i]
- elif dimension == 'coherence':
- cur_input = 'question: Is this a coherent response given the dialogue history? response: '\
- + output[i] + ' dialogue history: ' + src[i]
- elif dimension == 'engagingness':
- cur_input = 'question: Is this an engaging and informative response according to the dialogue history and fact? response: '\
- + output[i] + ' dialogue history: ' + src[i] + ' fact: ' + context[i]
- elif dimension == 'groundedness':
- cur_input = 'question: Is this response consistent with knowledge in the fact? response: '\
- + output[i] + ' fact: ' + context[i]
- elif dimension == 'understandability':
- cur_input = 'question: Is this an understandable response in the dialogue? response: ' + output[i]
- else:
- raise NotImplementedError(
- 'The input format for this dimension is still undefined. Please customize it first.')
- # For data-to-text
- elif task == 'data2text':
- if dimension == 'naturalness':
- cur_input = 'question: Is this a fluent utterance? utterance: ' + output[i]
- elif dimension == 'informativeness':
- cur_input = 'question: Is this sentence informative according to the reference? sentence: '\
- + output[i] + ' reference: ' + ref[i]
- else:
- raise NotImplementedError(
- 'The input format for this dimension is still undefined. Please customize it first.')
- # For factual consistency detection
- elif task == 'fact':
- if dimension == 'consistency':
- cur_input = 'question: Is this claim consistent with the document? claim: ' + output[
- i] + ' document: ' + src[i]
- else:
- raise NotImplementedError('No other dimensions for the factual consistency detection task.')
- # For new customized tasks
- else:
- raise NotImplementedError('Other tasks are not implemented, please customize specific tasks here.')
- input_with_question.append(cur_input)
- return input_with_question
-
-
-def convert_data_to_unieval_format(output_list, src_list=None, ref_list=None):
- """
- Convert the data into the unieval's format.
-
- output_list: a list of model output
-
- src_list: source input for different NLG tasks. For example, source document for summarization
- and dialogue history for dialogue response generation
- ref_list: human-annotated groundtruth
- """
- json_data = []
- for i in range(len(output_list)):
- cur = {}
- cur['system_output'] = output_list[i]
- if src_list is not None:
- cur['source'] = src_list[i]
- if ref_list is not None:
- cur['reference'] = ref_list[i]
- cur['context'] = ""
- json_data.append(cur)
- return json_data
-
-
-def calculate_average_score(scores):
- """
- Calculate average scores for different metrics
-
- scores: a list of scores for different metrics for each answer
-
- """
- metrics = {metric: 0 for metric in scores[0]}
-
- for score in scores:
- for metric in score:
- metrics[metric] += score[metric]
-
- for metric in metrics:
- metrics[metric] /= len(scores)
-
- return metrics
-
-
-def save_unieval_results(model_name: str, unieval_metric_stats: Dict[str, Dict], save_path: str) -> None:
- """
- Save UniEval evaluation results of different categories for one model.
-
- """
-
- if not os.path.exists(save_path):
- os.makedirs(save_path)
-
- unieval_metric_stats_per_category = {}
- for task, category_stat in unieval_metric_stats.items():
- for category, metric_stat in category_stat.items():
- if unieval_metric_stats_per_category.get(category, None) is None:
- unieval_metric_stats_per_category[category] = {}
- for metric, score in metric_stat.items():
- unieval_metric_stats_per_category[category][f"{metric}-{task}"] = score
-
- automatic_df = pd.DataFrame(unieval_metric_stats_per_category)
- automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True)
-
-
-def read_unieval_results(results_path: str, file_name: str) -> Dict[str, Dict]:
- """
- Read a csv file and return a dictionary which stores scores per metric.
-
- """
-
- results = pd.read_csv(os.path.join(results_path, file_name), index_col=0)
-
- results_dict = {metric: {} for metric in list(results.index)}
- for i, metric in enumerate(results_dict.keys()):
- for j, category in enumerate(list(results.columns)):
- if pd.isnull(results.iloc[i][j]):
- continue
- results_dict[metric][category] = results.iloc[i][j]
-
- return results_dict
-
-
-def analyze_unieval_results(results_path: str, save_path: str) -> None:
- """
- Analyze and visualize all csv files in the given folder.
-
- """
-
- if not os.path.exists(results_path):
- raise Exception(f'The given directory "{results_path}" doesn\'t exist! No results found!')
-
- all_statistics = {}
-
- for file_name in os.listdir(results_path):
- if file_name.endswith("_results.csv"):
- model_name = file_name.split("_results.csv")[0]
- all_statistics[model_name] = read_unieval_results(results_path, file_name)
-
- if len(list(all_statistics.keys())) == 0:
- raise Exception(f'There are no csv files in the given directory "{results_path}"!')
-
- frame_all = {"model": [], "category": [], "metric": [], "score": []}
- frame_per_metric = {}
- for model_name, model_statistics in all_statistics.items():
- for metric, metric_statistics in model_statistics.items():
- if frame_per_metric.get(metric) is None:
- frame_per_metric[metric] = {"model": [], "category": [], "score": []}
-
- for category, category_score in metric_statistics.items():
- frame_all["model"].append(model_name)
- frame_all["category"].append(category)
- frame_all["metric"].append(metric)
- frame_all["score"].append(category_score)
-
- frame_per_metric[metric]["model"].append(model_name)
- frame_per_metric[metric]["category"].append(category)
- frame_per_metric[metric]["score"].append(category_score)
-
- if not os.path.exists(save_path):
- os.makedirs(save_path)
-
- frame_all = pd.DataFrame(frame_all)
- frame_all.to_csv(os.path.join(save_path, "unieval_statistics.csv"))
-
- for metric in tqdm.tqdm(
- frame_per_metric.keys(),
- desc=f"UniEval metrics: ",
- total=len(frame_per_metric.keys()),
- ):
- data = pd.DataFrame(frame_per_metric[metric])
-
- sns.set()
- fig = plt.figure(figsize=(16, 10))
-
- fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True)
- fig.set_title(
- f"Comparison between Different Models for Metric {metric.split('-')[0].title()} in Task {metric.split('-')[1].title()}"
- )
- plt.xlabel("Evaluation Category")
- plt.ylabel("Score")
-
- figure = fig.get_figure()
- figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400)
-
- plt.close()
diff --git a/applications/Chat/evaluate/utils.py b/applications/Chat/evaluate/utils.py
deleted file mode 100644
index 406e43db99aa..000000000000
--- a/applications/Chat/evaluate/utils.py
+++ /dev/null
@@ -1,207 +0,0 @@
-import io
-import json
-import os
-import re
-import string
-from typing import Dict
-
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-import tqdm
-from zhon import hanzi
-
-
-def _make_w_io_base(f, mode: str):
- if not isinstance(f, io.IOBase):
- f_dirname = os.path.dirname(f)
- if f_dirname != "":
- os.makedirs(f_dirname, exist_ok=True)
- f = open(f, mode=mode)
- return f
-
-
-def _make_r_io_base(f, mode: str):
- if not isinstance(f, io.IOBase):
- f = open(f, mode=mode)
- return f
-
-
-def jdump(obj, f, mode="w", indent=4, default=str):
- """Dump a str or dictionary to a file in json format.
- Args:
- obj: An object to be written.
- f: A string path to the location on disk.
- mode: Mode for opening the file.
- indent: Indent for storing json dictionaries.
- default: A function to handle non-serializable entries; defaults to `str`.
- """
- f = _make_w_io_base(f, mode)
- if isinstance(obj, (dict, list)):
- json.dump(obj, f, indent=indent, default=default, ensure_ascii=False)
- elif isinstance(obj, str):
- f.write(obj)
- else:
- raise ValueError(f"Unexpected type: {type(obj)}")
- f.close()
-
-
-def jload(f, mode="r"):
- """Load a .json file into a dictionary."""
- f = _make_r_io_base(f, mode)
- jdict = json.load(f)
- f.close()
- return jdict
-
-
-def get_json_list(file_path):
- with open(file_path, 'r') as f:
- json_list = []
- for line in f:
- json_list.append(json.loads(line))
- return json_list
-
-
-def get_data_per_category(data, categories):
- data_per_category = {category: [] for category in categories}
- for item in data:
- category = item["category"]
- if category in categories:
- data_per_category[category].append(item)
-
- return data_per_category
-
-
-def remove_punctuations(text: str) -> str:
- """
- Remove punctuations in the given text.
- It is used in evaluation of automatic metrics.
-
- """
-
- punctuation = string.punctuation + hanzi.punctuation
- punctuation = set([char for char in punctuation])
- punctuation.difference_update(set("!@#$%&()<>?|,.\"'"))
-
- out = []
- for char in text:
- if char in punctuation:
- continue
- else:
- out.append(char)
-
- return "".join(out)
-
-
-def remove_redundant_space(text: str) -> str:
- """
- Remove redundant spaces in the given text.
- It is used in evaluation of automatic metrics.
-
- """
-
- return " ".join(text.split())
-
-
-def preprocessing_text(text: str) -> str:
- """
- Preprocess the given text.
- It is used in evaluation of automatic metrics.
-
- """
-
- return remove_redundant_space(remove_punctuations(text.lower()))
-
-
-def save_automatic_results(model_name: str, automatic_metric_stats: Dict[str, Dict], save_path: str) -> None:
- """
- Save automatic evaluation results of different categories for one model.
-
- """
-
- if not os.path.exists(save_path):
- os.makedirs(save_path)
-
- automatic_df = pd.DataFrame(automatic_metric_stats)
- automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True)
-
-
-def read_automatic_results(results_path: str, file_name: str) -> Dict[str, Dict]:
- """
- Read a csv file and return a dictionary which stores scores per metric.
-
- """
-
- results = pd.read_csv(os.path.join(results_path, file_name), index_col=0)
-
- results_dict = {metric: {} for metric in list(results.index)}
- for i, metric in enumerate(results_dict.keys()):
- for j, category in enumerate(list(results.columns)):
- if pd.isnull(results.iloc[i][j]):
- continue
- results_dict[metric][category] = results.iloc[i][j]
-
- return results_dict
-
-
-def analyze_automatic_results(results_path: str, save_path: str) -> None:
- """
- Analyze and visualize all csv files in the given folder.
-
- """
-
- if not os.path.exists(results_path):
- raise Exception(f'The given directory "{results_path}" doesn\'t exist! No results found!')
-
- all_statistics = {}
-
- for file_name in os.listdir(results_path):
- if file_name.endswith("_results.csv"):
- model_name = file_name.split("_results.csv")[0]
- all_statistics[model_name] = read_automatic_results(results_path, file_name)
-
- if len(list(all_statistics.keys())) == 0:
- raise Exception(f'There are no csv files in the given directory "{results_path}"!')
-
- frame_all = {"model": [], "category": [], "metric": [], "score": []}
- frame_per_metric = {}
- for model_name, model_statistics in all_statistics.items():
- for metric, metric_statistics in model_statistics.items():
- if frame_per_metric.get(metric) is None:
- frame_per_metric[metric] = {"model": [], "category": [], "score": []}
-
- for category, category_score in metric_statistics.items():
- frame_all["model"].append(model_name)
- frame_all["category"].append(category)
- frame_all["metric"].append(metric)
- frame_all["score"].append(category_score)
-
- frame_per_metric[metric]["model"].append(model_name)
- frame_per_metric[metric]["category"].append(category)
- frame_per_metric[metric]["score"].append(category_score)
-
- if not os.path.exists(save_path):
- os.makedirs(save_path)
-
- frame_all = pd.DataFrame(frame_all)
- frame_all.to_csv(os.path.join(save_path, "automatic_evaluation_statistics.csv"))
-
- for metric in tqdm.tqdm(
- frame_per_metric.keys(),
- desc=f"automatic metrics: ",
- total=len(frame_per_metric.keys()),
- ):
- data = pd.DataFrame(frame_per_metric[metric])
-
- sns.set()
- fig = plt.figure(figsize=(16, 10))
-
- fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True)
- fig.set_title(f"Comparison between Different Models for Metric {metric.title()}")
- plt.xlabel("Evaluation Category")
- plt.ylabel("Score")
-
- figure = fig.get_figure()
- figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400)
-
- plt.close()
diff --git a/applications/Chat/examples/community/peft/easy_dataset.py b/applications/Chat/examples/community/peft/easy_dataset.py
index 2fe293957079..d4b17689e9cb 100644
--- a/applications/Chat/examples/community/peft/easy_dataset.py
+++ b/applications/Chat/examples/community/peft/easy_dataset.py
@@ -3,7 +3,6 @@
from typing import Dict, Sequence
import torch
-from datasets import load_dataset
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer
@@ -20,7 +19,8 @@ def _tokenize_fn(strings: Sequence[str], tokenizer: AutoTokenizer, max_length: i
padding="longest",
max_length=max_length,
truncation=True,
- ) for text in strings
+ )
+ for text in strings
]
input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
input_ids_lens = labels_lens = [
@@ -48,18 +48,17 @@ def preprocess(sources: Sequence[str], targets: Sequence[str], tokenizer: AutoTo
class EasySupervisedDataset(Dataset):
-
def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 512) -> None:
super(EasySupervisedDataset, self).__init__()
with open(data_file, "r", encoding="UTF-8") as f:
all_lines = f.readlines()
- #split to source and target ,source the characters before "回答:" including "回答:", target the characters after "回答:"
+ # split to source and target ,source the characters before "回答:" including "回答:", target the characters after "回答:"
sources, targets = [], []
for line in all_lines:
if "回答:" in line:
sep_index = line.index("回答:")
- sources.append(line[:sep_index + 3])
- targets.append(line[sep_index + 3:] + tokenizer.eos_token)
+ sources.append(line[: sep_index + 3])
+ targets.append(line[sep_index + 3 :] + tokenizer.eos_token)
else:
sources.append(line)
targets.append("" + tokenizer.eos_token)
@@ -83,15 +82,17 @@ def __str__(self):
class EasyPromptsDataset(Dataset):
-
def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 96) -> None:
super(EasyPromptsDataset, self).__init__()
with open(data_file, "r", encoding="UTF-8") as f:
all_lines = f.readlines()
- all_lines = [line if "回答:" not in line else line[:line.index("回答:") + 3] for line in all_lines]
+ all_lines = [line if "回答:" not in line else line[: line.index("回答:") + 3] for line in all_lines]
self.prompts = [
- tokenizer(line, return_tensors='pt', max_length=max_length, padding='max_length',
- truncation=True)['input_ids'].to(torch.cuda.current_device()).squeeze(0)
+ tokenizer(line, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True)[
+ "input_ids"
+ ]
+ .to(torch.cuda.current_device())
+ .squeeze(0)
for line in tqdm(all_lines)
]
self.data_file = data_file
@@ -110,7 +111,6 @@ def __str__(self):
class EasyRewardDataset(Dataset):
-
def __init__(self, train_file: str, tokenizer: AutoTokenizer, special_token=None, max_length=512) -> None:
super(EasyRewardDataset, self).__init__()
self.chosen = []
@@ -120,44 +120,42 @@ def __init__(self, train_file: str, tokenizer: AutoTokenizer, special_token=None
else:
self.end_token = special_token
print(self.end_token)
- #read all lines in the train_file to a list
+ # read all lines in the train_file to a list
with open(train_file, "r", encoding="UTF-8") as f:
all_lines = f.readlines()
for line in tqdm(all_lines):
data = json.loads(line)
- prompt = "提问:" + data['prompt'] + " 回答:"
-
- chosen = prompt + data['chosen'] + self.end_token
- chosen_token = tokenizer(chosen,
- max_length=max_length,
- padding="max_length",
- truncation=True,
- return_tensors="pt")
- self.chosen.append({
- "input_ids": chosen_token['input_ids'],
- "attention_mask": chosen_token['attention_mask']
- })
-
- reject = prompt + data['rejected'] + self.end_token
- reject_token = tokenizer(reject,
- max_length=max_length,
- padding="max_length",
- truncation=True,
- return_tensors="pt")
- self.reject.append({
- "input_ids": reject_token['input_ids'],
- "attention_mask": reject_token['attention_mask']
- })
+ prompt = "提问:" + data["prompt"] + " 回答:"
+
+ chosen = prompt + data["chosen"] + self.end_token
+ chosen_token = tokenizer(
+ chosen, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+ )
+ self.chosen.append(
+ {"input_ids": chosen_token["input_ids"], "attention_mask": chosen_token["attention_mask"]}
+ )
+
+ reject = prompt + data["rejected"] + self.end_token
+ reject_token = tokenizer(
+ reject, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+ )
+ self.reject.append(
+ {"input_ids": reject_token["input_ids"], "attention_mask": reject_token["attention_mask"]}
+ )
def __len__(self):
length = len(self.chosen)
return length
def __getitem__(self, idx):
- return self.chosen[idx]["input_ids"], self.chosen[idx]["attention_mask"], self.reject[idx][
- "input_ids"], self.reject[idx]["attention_mask"]
-
- #python representation of the object and the string representation of the object
+ return (
+ self.chosen[idx]["input_ids"],
+ self.chosen[idx]["attention_mask"],
+ self.reject[idx]["input_ids"],
+ self.reject[idx]["attention_mask"],
+ )
+
+ # python representation of the object and the string representation of the object
def __repr__(self):
return f"LawRewardDataset(chosen_len={len(self.chosen)}, reject_len={len(self.reject)})"
@@ -165,26 +163,25 @@ def __str__(self):
return f"LawRewardDataset(chosen_len={len(self.chosen)}, reject_len={len(self.reject)})"
-'''
+"""
Easy SFT just accept a text file which can be read line by line. However the datasets will group texts together to max_length so LLM will learn the texts meaning better.
If individual lines are not related, just set is_group_texts to False.
-'''
+"""
class EasySFTDataset(Dataset):
-
def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_group_texts=True) -> None:
super().__init__()
- #read the data_file line by line
+ # read the data_file line by line
with open(data_file, "r", encoding="UTF-8") as f:
- #encode the text data line by line and put raw python list input_ids only to raw_input_ids list
+ # encode the text data line by line and put raw python list input_ids only to raw_input_ids list
raw_input_ids = []
for line in f:
encoded_ids = tokenizer.encode(line)
- #if the encoded_ids is longer than max_length, then split it into several parts
+ # if the encoded_ids is longer than max_length, then split it into several parts
if len(encoded_ids) > max_length:
for i in range(0, len(encoded_ids), max_length):
- raw_input_ids.append(encoded_ids[i:i + max_length])
+ raw_input_ids.append(encoded_ids[i : i + max_length])
else:
raw_input_ids.append(encoded_ids)
@@ -196,12 +193,13 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
if is_group_texts:
for input_ids in raw_input_ids:
if len(current_input_ids) + len(input_ids) > max_length:
- #pad the current_input_ids to max_length with tokenizer.pad_token_id
+ # pad the current_input_ids to max_length with tokenizer.pad_token_id
padded_length = max_length - len(current_input_ids)
current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
attention_mask.append(
- torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
+ torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)
+ )
current_input_ids = []
else:
current_input_ids.extend(input_ids)
@@ -210,14 +208,16 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
attention_mask.append(
- torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
+ torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)
+ )
else:
- #just append the raw_input_ids to max_length
+ # just append the raw_input_ids to max_length
for input_ids in raw_input_ids:
padded_length = max_length - len(input_ids)
input_ids.extend([tokenizer.pad_token_id] * padded_length)
attention_mask.append(
- torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
+ torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)
+ )
grouped_input_ids.append(torch.tensor(input_ids, dtype=torch.long))
self.input_ids = grouped_input_ids
self.labels = copy.deepcopy(self.input_ids)
@@ -227,14 +227,14 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
def __len__(self):
return len(self.input_ids)
- #get item from dataset
+ # get item from dataset
def __getitem__(self, idx):
return dict(input_ids=self.input_ids[idx], labels=self.labels[idx], attention_mask=self.attention_mask[idx])
- #generate the dataset description to be printed by print in python
+ # generate the dataset description to be printed by print in python
def __repr__(self):
return f"EasySFTDataset(len={len(self)},\nfile_name is {self.file_name})"
- #generate the dataset description to be printed by print in python
+ # generate the dataset description to be printed by print in python
def __str__(self):
return f"EasySFTDataset(len={len(self)},\nfile_name is {self.file_name})"
diff --git a/applications/Chat/examples/community/peft/easy_models.py b/applications/Chat/examples/community/peft/easy_models.py
index fe294868159d..db629e50ed94 100644
--- a/applications/Chat/examples/community/peft/easy_models.py
+++ b/applications/Chat/examples/community/peft/easy_models.py
@@ -4,7 +4,7 @@
import torch.nn as nn
import torch.nn.functional as F
from coati.models.generation import generate
-from coati.models.utils import log_probs_from_logits, masked_mean
+from coati.models.utils import log_probs_from_logits
from peft import PeftModel
from torch.nn.modules import Module
from transformers import BloomConfig, BloomForCausalLM
@@ -24,38 +24,33 @@ def __init__(self, model: nn.Module) -> None:
@torch.no_grad()
def generate(
- self,
- input_ids: torch.Tensor,
- return_action_mask: bool = True,
- **kwargs
+ self, input_ids: torch.Tensor, return_action_mask: bool = True, **kwargs
) -> Union[Tuple[torch.LongTensor, torch.LongTensor], Tuple[torch.LongTensor, torch.LongTensor, torch.BoolTensor]]:
sequences = generate(self.model, input_ids, **kwargs)
attention_mask = None
- pad_token_id = kwargs.get('pad_token_id', None)
+ pad_token_id = kwargs.get("pad_token_id", None)
if pad_token_id is not None:
attention_mask = sequences.not_equal(pad_token_id).to(dtype=torch.long, device=sequences.device)
if not return_action_mask:
return sequences, attention_mask, None
input_len = input_ids.size(1)
- eos_token_id = kwargs.get('eos_token_id', None)
+ eos_token_id = kwargs.get("eos_token_id", None)
if eos_token_id is None:
action_mask = torch.ones_like(sequences, dtype=torch.bool)
else:
# left padding may be applied, only mask action
action_mask = (sequences[:, input_len:] == eos_token_id).cumsum(dim=-1) == 0
- action_mask = F.pad(action_mask, (1 + input_len, -1), value=True) # include eos token and input
+ action_mask = F.pad(action_mask, (1 + input_len, -1), value=True) # include eos token and input
action_mask[:, :input_len] = False
action_mask = action_mask[:, 1:]
- return sequences, attention_mask, action_mask[:, -(sequences.size(1) - input_len):]
+ return sequences, attention_mask, action_mask[:, -(sequences.size(1) - input_len) :]
- def forward(self,
- sequences: torch.LongTensor,
- num_actions: int,
- attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
- """Returns action log probs
- """
+ def forward(
+ self, sequences: torch.LongTensor, num_actions: int, attention_mask: Optional[torch.Tensor] = None
+ ) -> torch.Tensor:
+ """Returns action log probs"""
output = self.model(sequences, attention_mask=attention_mask)
- logits = output['logits']
+ logits = output["logits"]
log_probs = log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:])
return log_probs[:, -num_actions:]
@@ -75,11 +70,13 @@ class BLOOMActor(Actor):
lora_train_bias (str): LoRA bias training mode.
"""
- def __init__(self,
- pretrained: str = None,
- config: Optional[BloomConfig] = None,
- checkpoint: bool = False,
- lora_path: str = None) -> None:
+ def __init__(
+ self,
+ pretrained: str = None,
+ config: Optional[BloomConfig] = None,
+ checkpoint: bool = False,
+ lora_path: str = None,
+ ) -> None:
if pretrained is not None:
model = BloomForCausalLM.from_pretrained(pretrained)
elif config is not None:
diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py
index 9385e457d852..1dd9ffcdf1cd 100644
--- a/applications/Chat/examples/community/peft/train_peft_prompts.py
+++ b/applications/Chat/examples/community/peft/train_peft_prompts.py
@@ -1,18 +1,16 @@
import argparse
-import pandas as pd
import torch
import torch.distributed as dist
-from coati.dataset import DataCollatorForSupervisedDataset, PromptDataset, SupervisedDataset
+from coati.dataset import DataCollatorForSupervisedDataset
from coati.models.bloom import BLOOMRM, BLOOMCritic
-from coati.models.gpt import GPTRM, GPTActor, GPTCritic
-from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
-from coati.models.opt import OPTRM, OPTActor, OPTCritic
+from coati.models.gpt import GPTRM, GPTCritic
+from coati.models.llama import LlamaCritic, LlamaRM
+from coati.models.opt import OPTRM, OPTCritic
from coati.trainer import PPOTrainer
from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
from easy_dataset import EasyPromptsDataset, EasySupervisedDataset
from easy_models import BLOOMActor
-from peft import PeftModel
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
@@ -23,24 +21,24 @@
def main(args):
# configure strategy
- if args.strategy == 'ddp':
+ if args.strategy == "ddp":
strategy = DDPStrategy()
- elif args.strategy == 'colossalai_gemini':
- strategy = GeminiStrategy(placement_policy='cpu', initial_scale=2**5)
- elif args.strategy == 'colossalai_zero2':
- strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
+ elif args.strategy == "colossalai_gemini":
+ strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
+ elif args.strategy == "colossalai_zero2":
+ strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
if args.rm_path is not None:
- state_dict = torch.load(args.rm_path, map_location='cpu')
+ state_dict = torch.load(args.rm_path, map_location="cpu")
# configure model
- if args.model == 'bloom':
+ if args.model == "bloom":
# initial_model = BLOOMActor(pretrained=args.pretrain)
- print('Using peft lora to load Bloom model as initial_model')
+ print("Using peft lora to load Bloom model as initial_model")
initial_model = BLOOMActor(pretrained=args.pretrain, lora_path=args.sft_lora_path)
- print('Using peft lora to load Bloom model as initial_model (Done)')
+ print("Using peft lora to load Bloom model as initial_model (Done)")
else:
raise ValueError(f'Unsupported actor model "{args.model}"')
@@ -49,59 +47,59 @@ def main(args):
else:
rm_model_name = args.rm_model
- if rm_model_name == 'gpt2':
+ if rm_model_name == "gpt2":
reward_model = GPTRM(pretrained=args.rm_pretrain)
- elif rm_model_name == 'bloom':
+ elif rm_model_name == "bloom":
print("load bloom reward model ", args.rm_pretrain)
reward_model = BLOOMRM(pretrained=args.rm_pretrain)
- elif rm_model_name == 'opt':
+ elif rm_model_name == "opt":
reward_model = OPTRM(pretrained=args.rm_pretrain)
- elif rm_model_name == 'llama':
+ elif rm_model_name == "llama":
reward_model = LlamaRM(pretrained=args.rm_pretrain)
else:
raise ValueError(f'Unsupported reward model "{rm_model_name}"')
if args.rm_path is not None:
- print('Loading reward model from', args.rm_path)
+ print("Loading reward model from", args.rm_path)
reward_model.load_state_dict(state_dict)
- if args.strategy != 'colossalai_gemini':
+ if args.strategy != "colossalai_gemini":
initial_model.to(torch.float16).to(torch.cuda.current_device())
reward_model.to(torch.float16).to(torch.cuda.current_device())
with strategy.model_init_context():
- if args.model == 'bloom':
+ if args.model == "bloom":
# actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
- print('Using peft lora to load Bloom model as Actor')
+ print("Using peft lora to load Bloom model as Actor")
actor = BLOOMActor(pretrained=args.pretrain, lora_path=args.sft_lora_path)
- print('Using peft lora to load Bloom model as Actor (Done)')
+ print("Using peft lora to load Bloom model as Actor (Done)")
else:
raise ValueError(f'Unsupported actor model "{args.model}"')
- if rm_model_name == 'gpt2':
+ if rm_model_name == "gpt2":
critic = GPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
- elif rm_model_name == 'bloom':
+ elif rm_model_name == "bloom":
print("load bloom critic ", args.rm_pretrain, " lora_rank ", args.lora_rank, " use_action_mask ", True)
critic = BLOOMCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
print("load bloom critic (Done) ")
- elif rm_model_name == 'opt':
+ elif rm_model_name == "opt":
critic = OPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
- elif rm_model_name == 'llama':
+ elif rm_model_name == "llama":
critic = LlamaCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
else:
raise ValueError(f'Unsupported reward model "{rm_model_name}"')
if args.rm_path is not None:
- print('Loading reward model from', args.rm_path)
+ print("Loading reward model from", args.rm_path)
critic.load_state_dict(state_dict)
del state_dict
- if args.strategy != 'colossalai_gemini':
+ if args.strategy != "colossalai_gemini":
critic.to(torch.float16).to(torch.cuda.current_device())
actor.to(torch.float16).to(torch.cuda.current_device())
# configure optimizer
- if args.strategy.startswith('colossalai'):
+ if args.strategy.startswith("colossalai"):
actor_optim = HybridAdam(actor.parameters(), lr=1e-7)
critic_optim = HybridAdam(critic.parameters(), lr=1e-7)
else:
@@ -109,18 +107,18 @@ def main(args):
critic_optim = Adam(critic.parameters(), lr=1e-7)
# configure tokenizer
- if args.model == 'gpt2':
+ if args.model == "gpt2":
tokenizer = GPT2Tokenizer.from_pretrained(args.rm_pretrain)
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'bloom':
+ elif args.model == "bloom":
tokenizer = BloomTokenizerFast.from_pretrained(args.rm_pretrain)
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'opt':
+ elif args.model == "opt":
tokenizer = AutoTokenizer.from_pretrained(args.rm_pretrain)
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'llama':
+ elif args.model == "llama":
tokenizer = LlamaTokenizer.from_pretrained(args.pretrain)
- tokenizer.eos_token = '<\s>'
+ tokenizer.eos_token = ""
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
@@ -132,26 +130,27 @@ def main(args):
prompt_sampler = DistributedSampler(prompt_dataset, shuffle=True, seed=42, drop_last=True)
else:
prompt_sampler = None
- prompt_dataloader = DataLoader(prompt_dataset,
- shuffle=(prompt_sampler is None),
- sampler=prompt_sampler,
- batch_size=args.train_batch_size)
+ prompt_dataloader = DataLoader(
+ prompt_dataset, shuffle=(prompt_sampler is None), sampler=prompt_sampler, batch_size=args.train_batch_size
+ )
pretrain_dataset = EasySupervisedDataset(args.pretrain_dataset, tokenizer)
if dist.is_initialized() and dist.get_world_size() > 1:
pretrain_sampler = DistributedSampler(pretrain_dataset, shuffle=True, seed=42, drop_last=True)
else:
pretrain_sampler = None
- pretrain_dataloader = DataLoader(pretrain_dataset,
- shuffle=(pretrain_sampler is None),
- sampler=pretrain_sampler,
- batch_size=args.ptx_batch_size,
- collate_fn=data_collator)
+ pretrain_dataloader = DataLoader(
+ pretrain_dataset,
+ shuffle=(pretrain_sampler is None),
+ sampler=pretrain_sampler,
+ batch_size=args.ptx_batch_size,
+ collate_fn=data_collator,
+ )
def tokenize_fn(texts):
# MUST padding to max length to ensure inputs of all ranks have the same length
# Different length may lead to hang when using gemini, as different generation steps
- batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
+ batch = tokenizer(texts, return_tensors="pt", max_length=96, padding="max_length", truncation=True)
return {k: v.to(torch.cuda.current_device()) for k, v in batch.items()}
(actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim))
@@ -178,45 +177,46 @@ def tokenize_fn(texts):
eos_token_id=tokenizer.eos_token_id,
)
- trainer.fit(prompt_dataloader=prompt_dataloader,
- pretrain_dataloader=pretrain_dataloader,
- num_episodes=args.num_episodes,
- num_update_steps=args.num_update_steps,
- num_collect_steps=args.num_collect_steps)
+ trainer.fit(
+ prompt_dataloader=prompt_dataloader,
+ pretrain_dataloader=pretrain_dataloader,
+ num_episodes=args.num_episodes,
+ num_update_steps=args.num_update_steps,
+ num_collect_steps=args.num_collect_steps,
+ )
# save model checkpoint after fitting
trainer.save_model(args.save_path, only_rank0=True, tokenizer=tokenizer)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
- strategy.save_optimizer(actor_optim,
- 'actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
- only_rank0=False)
+ strategy.save_optimizer(
+ actor_optim, "actor_optim_checkpoint_prompts_%d.pt" % (torch.cuda.current_device()), only_rank0=False
+ )
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--prompt_path', type=str, default=None, help='path to the prompt dataset')
- parser.add_argument('--pretrain_dataset', type=str, default=None, help='path to the pretrained dataset')
- parser.add_argument('--strategy',
- choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'],
- default='ddp',
- help='strategy to use')
- parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
- parser.add_argument('--pretrain', type=str, default=None)
- parser.add_argument('--sft_lora_path', type=str, default=None)
- parser.add_argument('--rm_model', default=None, choices=['gpt2', 'bloom', 'opt', 'llama'])
- parser.add_argument('--rm_path', type=str, default=None)
- parser.add_argument('--rm_pretrain', type=str, default=None)
- parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts')
- parser.add_argument('--need_optim_ckpt', type=bool, default=False)
- parser.add_argument('--num_episodes', type=int, default=10)
- parser.add_argument('--num_collect_steps', type=int, default=10)
- parser.add_argument('--num_update_steps', type=int, default=5)
- parser.add_argument('--train_batch_size', type=int, default=2)
- parser.add_argument('--ptx_batch_size', type=int, default=1)
- parser.add_argument('--experience_batch_size', type=int, default=8)
- parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
- parser.add_argument('--kl_coef', type=float, default=0.1)
- parser.add_argument('--ptx_coef', type=float, default=0.9)
+ parser.add_argument("--prompt_path", type=str, default=None, help="path to the prompt dataset")
+ parser.add_argument("--pretrain_dataset", type=str, default=None, help="path to the pretrained dataset")
+ parser.add_argument(
+ "--strategy", choices=["ddp", "colossalai_gemini", "colossalai_zero2"], default="ddp", help="strategy to use"
+ )
+ parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
+ parser.add_argument("--pretrain", type=str, default=None)
+ parser.add_argument("--sft_lora_path", type=str, default=None)
+ parser.add_argument("--rm_model", default=None, choices=["gpt2", "bloom", "opt", "llama"])
+ parser.add_argument("--rm_path", type=str, default=None)
+ parser.add_argument("--rm_pretrain", type=str, default=None)
+ parser.add_argument("--save_path", type=str, default="actor_checkpoint_prompts")
+ parser.add_argument("--need_optim_ckpt", type=bool, default=False)
+ parser.add_argument("--num_episodes", type=int, default=10)
+ parser.add_argument("--num_collect_steps", type=int, default=10)
+ parser.add_argument("--num_update_steps", type=int, default=5)
+ parser.add_argument("--train_batch_size", type=int, default=2)
+ parser.add_argument("--ptx_batch_size", type=int, default=1)
+ parser.add_argument("--experience_batch_size", type=int, default=8)
+ parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+ parser.add_argument("--kl_coef", type=float, default=0.1)
+ parser.add_argument("--ptx_coef", type=float, default=0.9)
args = parser.parse_args()
main(args)
diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py
index 4af08e6d0141..6d395deadd0e 100644
--- a/applications/Chat/examples/community/peft/train_peft_sft.py
+++ b/applications/Chat/examples/community/peft/train_peft_sft.py
@@ -1,18 +1,10 @@
import argparse
import os
-import loralib as lora
import torch
import torch.distributed as dist
-from coati.dataset import DataCollatorForSupervisedDataset, SFTDataset, SupervisedDataset
-from coati.models.base import RewardModel
-from coati.models.bloom import BLOOMLM
-from coati.models.gpt import GPTLM
-from coati.models.llama import LlamaLM
-from coati.models.opt import OPTLM
from coati.trainer import SFTTrainer
from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
-from datasets import load_dataset
from easy_dataset import EasyDataset
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from torch.optim import Adam
@@ -29,75 +21,76 @@
def train(args):
# configure strategy
- if args.strategy == 'ddp':
+ if args.strategy == "ddp":
strategy = DDPStrategy()
- elif args.strategy == 'colossalai_gemini':
- strategy = GeminiStrategy(placement_policy='cuda')
- elif args.strategy == 'colossalai_zero2':
- strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
+ elif args.strategy == "colossalai_gemini":
+ strategy = GeminiStrategy(placement_policy="static")
+ elif args.strategy == "colossalai_zero2":
+ strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
# configure model
with strategy.model_init_context():
- print('Warning: currently only bloom is tested, gpt2,llama and opt are not tested')
+ print("Warning: currently only bloom is tested, gpt2,llama and opt are not tested")
model = AutoModelForCausalLM.from_pretrained(args.pretrain).to(torch.cuda.current_device())
# if the args.save_path exists and args.save_path+'/adapter_config.json' exists, we'll load the adapter_config.json
- if os.path.exists(args.save_path) and os.path.exists(args.save_path + '/adapter_config.json') \
- and os.path.exists(args.save_path + '/adapter_model.bin'):
+ if (
+ os.path.exists(args.save_path)
+ and os.path.exists(args.save_path + "/adapter_config.json")
+ and os.path.exists(args.save_path + "/adapter_model.bin")
+ ):
print("loading from saved peft model ", args.save_path)
model = PeftModel.from_pretrained(model, args.save_path)
else:
# we'll use peft lora library to do the lora
lora_rank = args.lora_rank if args.lora_rank > 0 else 32
# config lora with rank of lora_rank
- lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
- inference_mode=False,
- r=lora_rank,
- lora_alpha=32,
- lora_dropout=0.1)
+ lora_config = LoraConfig(
+ task_type=TaskType.CAUSAL_LM, inference_mode=False, r=lora_rank, lora_alpha=32, lora_dropout=0.1
+ )
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# configure tokenizer
- if args.model == 'gpt2':
- tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+ if args.model == "gpt2":
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'bloom':
+ elif args.model == "bloom":
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'opt':
+ elif args.model == "opt":
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'llama':
+ elif args.model == "llama":
tokenizer = AutoTokenizer.from_pretrained(
args.pretrain,
padding_side="right",
use_fast=False,
)
- tokenizer.eos_token = '<\s>'
+ tokenizer.eos_token = ""
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
- if args.model == 'llama' and args.strategy == 'colossalai_gemini':
+ if args.model == "llama" and args.strategy == "colossalai_gemini":
# this is a hack to deal with the resized embedding
# to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatibility
for name, param in model.named_parameters():
if not isinstance(param, ColoParameter):
- sub_module_name = '.'.join(name.split('.')[:-1])
- weight_name = name.split('.')[-1]
+ sub_module_name = ".".join(name.split(".")[:-1])
+ weight_name = name.split(".")[-1]
sub_module = model.get_submodule(sub_module_name)
setattr(sub_module, weight_name, ColoParameter(param))
# configure optimizer
- if args.strategy.startswith('colossalai'):
+ if args.strategy.startswith("colossalai"):
optim = HybridAdam(model.parameters(), lr=args.lr, clipping_norm=1.0)
else:
optim = Adam(model.parameters(), lr=args.lr)
logger = get_dist_logger()
- logger.set_level('WARNING')
+ logger.set_level("WARNING")
# configure dataset
law_dataset = EasyDataset(args.dataset, tokenizer=tokenizer, is_group_texts=not args.is_short_text)
@@ -108,47 +101,57 @@ def train(args):
eval_dataset = EasyDataset(args.eval_dataset, tokenizer=tokenizer, is_group_texts=not args.is_short_text)
data_collator = default_collate
if dist.is_initialized() and dist.get_world_size() > 1:
- train_sampler = DistributedSampler(train_dataset,
- shuffle=True,
- seed=42,
- drop_last=True,
- rank=dist.get_rank(),
- num_replicas=dist.get_world_size())
+ train_sampler = DistributedSampler(
+ train_dataset,
+ shuffle=True,
+ seed=42,
+ drop_last=True,
+ rank=dist.get_rank(),
+ num_replicas=dist.get_world_size(),
+ )
if eval_dataset is not None:
- eval_sampler = DistributedSampler(eval_dataset,
- shuffle=False,
- seed=42,
- drop_last=False,
- rank=dist.get_rank(),
- num_replicas=dist.get_world_size())
+ eval_sampler = DistributedSampler(
+ eval_dataset,
+ shuffle=False,
+ seed=42,
+ drop_last=False,
+ rank=dist.get_rank(),
+ num_replicas=dist.get_world_size(),
+ )
else:
train_sampler = None
eval_sampler = None
- train_dataloader = DataLoader(train_dataset,
- shuffle=(train_sampler is None),
- sampler=train_sampler,
- batch_size=args.batch_size,
- collate_fn=data_collator,
- pin_memory=True)
+ train_dataloader = DataLoader(
+ train_dataset,
+ shuffle=(train_sampler is None),
+ sampler=train_sampler,
+ batch_size=args.batch_size,
+ collate_fn=data_collator,
+ pin_memory=True,
+ )
if eval_dataset is not None:
- eval_dataloader = DataLoader(eval_dataset,
- shuffle=(eval_sampler is None),
- sampler=eval_sampler,
- batch_size=args.batch_size,
- collate_fn=data_collator,
- pin_memory=True)
+ eval_dataloader = DataLoader(
+ eval_dataset,
+ shuffle=(eval_sampler is None),
+ sampler=eval_sampler,
+ batch_size=args.batch_size,
+ collate_fn=data_collator,
+ pin_memory=True,
+ )
else:
eval_dataloader = None
- trainer = SFTTrainer(model=model,
- strategy=strategy,
- optim=optim,
- train_dataloader=train_dataloader,
- eval_dataloader=eval_dataloader,
- batch_size=args.batch_size,
- max_epochs=args.max_epochs,
- accumulation_steps=args.accumulation_steps)
+ trainer = SFTTrainer(
+ model=model,
+ strategy=strategy,
+ optim=optim,
+ train_dataloader=train_dataloader,
+ eval_dataloader=eval_dataloader,
+ batch_size=args.batch_size,
+ max_epochs=args.max_epochs,
+ accumulation_steps=args.accumulation_steps,
+ )
trainer.fit(logger=logger, log_interval=args.log_interval)
@@ -156,29 +159,27 @@ def train(args):
trainer.save_model(path=args.save_path, only_rank0=True, tokenizer=tokenizer)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
- strategy.save_optimizer(trainer.optimizer,
- 'rm_optim_checkpoint_%d.pt' % (torch.cuda.current_device()),
- only_rank0=False)
+ strategy.save_optimizer(
+ trainer.optimizer, "rm_optim_checkpoint_%d.pt" % (torch.cuda.current_device()), only_rank0=False
+ )
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--strategy',
- choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'],
- default='ddp')
- parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom')
- parser.add_argument('--pretrain', type=str, default=None)
- parser.add_argument('--dataset', type=str, default=None)
- parser.add_argument('--eval_dataset', type=str, default=None)
- parser.add_argument('--save_path', type=str, default='output')
- parser.add_argument('--need_optim_ckpt', type=bool, default=False)
- parser.add_argument('--max_epochs', type=int, default=3)
- parser.add_argument('--batch_size', type=int, default=4)
- parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
- parser.add_argument('--log_interval', type=int, default=100, help="how many steps to log")
- parser.add_argument('--lr', type=float, default=5e-6)
- parser.add_argument('--accumulation_steps', type=int, default=8)
- parser.add_argument('--enable_peft_lora', action='store_true', default=False)
- parser.add_argument("--is_short_text", action='store_true', default=False)
+ parser.add_argument("--strategy", choices=["ddp", "colossalai_gemini", "colossalai_zero2"], default="ddp")
+ parser.add_argument("--model", choices=["gpt2", "bloom", "opt", "llama"], default="bloom")
+ parser.add_argument("--pretrain", type=str, default=None)
+ parser.add_argument("--dataset", type=str, default=None)
+ parser.add_argument("--eval_dataset", type=str, default=None)
+ parser.add_argument("--save_path", type=str, default="output")
+ parser.add_argument("--need_optim_ckpt", type=bool, default=False)
+ parser.add_argument("--max_epochs", type=int, default=3)
+ parser.add_argument("--batch_size", type=int, default=4)
+ parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+ parser.add_argument("--log_interval", type=int, default=100, help="how many steps to log")
+ parser.add_argument("--lr", type=float, default=5e-6)
+ parser.add_argument("--accumulation_steps", type=int, default=8)
+ parser.add_argument("--enable_peft_lora", action="store_true", default=False)
+ parser.add_argument("--is_short_text", action="store_true", default=False)
args = parser.parse_args()
train(args)
diff --git a/applications/Chat/examples/community/ray/ray_job_script.py b/applications/Chat/examples/community/ray/ray_job_script.py
index 53f304d379fe..e8a1175a9c32 100644
--- a/applications/Chat/examples/community/ray/ray_job_script.py
+++ b/applications/Chat/examples/community/ray/ray_job_script.py
@@ -6,16 +6,25 @@
def main(api_server_endpoint="http://127.0.0.1:8265"):
client = JobSubmissionClient(api_server_endpoint)
client.submit_job(
- entrypoint=
- "python experimental/ray/train_prompts_on_ray.py --strategy colossalai_zero2 --prompt_csv_url https://huggingface.co/datasets/fka/awesome-chatgpt-prompts/resolve/main/prompts.csv",
+ entrypoint="python experimental/ray/train_prompts_on_ray.py --strategy colossalai_zero2 --prompt_csv_url https://huggingface.co/datasets/fka/awesome-chatgpt-prompts/resolve/main/prompts.csv",
runtime_env={
- "working_dir":
- "applications/Chat",
+ "working_dir": "applications/Chat",
"pip": [
- "torch==1.13.1", "transformers>=4.20.1", "datasets", "loralib", "colossalai>=0.2.4", "langchain",
- "tokenizers", "fastapi", "sse_starlette", "wandb", "sentencepiece", "gpustat"
- ]
- })
+ "torch==1.13.1",
+ "transformers>=4.20.1",
+ "datasets",
+ "loralib",
+ "colossalai>=0.2.4",
+ "langchain",
+ "tokenizers",
+ "fastapi",
+ "sse_starlette",
+ "wandb",
+ "sentencepiece",
+ "gpustat",
+ ],
+ },
+ )
if __name__ == "__main__":
diff --git a/applications/Chat/examples/community/ray/train_prompts_on_ray.py b/applications/Chat/examples/community/ray/train_prompts_on_ray.py
index 1bba9ad66fbc..8abd83a8b249 100644
--- a/applications/Chat/examples/community/ray/train_prompts_on_ray.py
+++ b/applications/Chat/examples/community/ray/train_prompts_on_ray.py
@@ -26,9 +26,14 @@
class ExperienceCompositionRefs:
-
- def __init__(self, sequences_attention_mask_action_mask_ref: ray.ObjectRef, action_log_probs_ref: ray.ObjectRef,
- base_action_log_probs_ref: ray.ObjectRef, value_ref: ray.ObjectRef, r_ref: ray.ObjectRef) -> None:
+ def __init__(
+ self,
+ sequences_attention_mask_action_mask_ref: ray.ObjectRef,
+ action_log_probs_ref: ray.ObjectRef,
+ base_action_log_probs_ref: ray.ObjectRef,
+ value_ref: ray.ObjectRef,
+ r_ref: ray.ObjectRef,
+ ) -> None:
self.sequences_attention_mask_action_mask_ref = sequences_attention_mask_action_mask_ref
self.action_log_probs_ref = action_log_probs_ref
self.base_action_log_probs_ref = base_action_log_probs_ref
@@ -37,14 +42,14 @@ def __init__(self, sequences_attention_mask_action_mask_ref: ray.ObjectRef, acti
class ExperienceMaker:
-
def __init__(self, kl_coef) -> None:
self.kl_coef = kl_coef
@torch.no_grad()
def make_experience(self, experiment_computation_refs: ExperienceCompositionRefs):
sequences, attention_mask, action_mask = ray.get(
- experiment_computation_refs.sequences_attention_mask_action_mask_ref)
+ experiment_computation_refs.sequences_attention_mask_action_mask_ref
+ )
action_log_probs = ray.get(experiment_computation_refs.action_log_probs_ref)
base_action_log_probs = ray.get(experiment_computation_refs.base_action_log_probs_ref)
r = ray.get(experiment_computation_refs.r_ref)
@@ -58,11 +63,10 @@ def make_experience(self, experiment_computation_refs: ExperienceCompositionRefs
class DistributedTorchRayActor:
-
def __init__(self, world_size, rank, local_rank, master_addr, master_port):
- logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
- level=logging.INFO,
- datefmt='%Y-%m-%d %H:%M:%S')
+ logging.basicConfig(
+ format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
+ )
self._model = None
self._world_size = world_size
self._rank = rank
@@ -82,7 +86,7 @@ def _get_current_node_ip():
@staticmethod
def _get_free_port():
with socket.socket() as sock:
- sock.bind(('', 0))
+ sock.bind(("", 0))
return sock.getsockname()[1]
def get_master_addr_port(self):
@@ -90,7 +94,6 @@ def get_master_addr_port(self):
class BasePPORole(DistributedTorchRayActor):
-
def add_experience_maker(self, kl_coef: float = 0.1):
self._experience_maker = ExperienceMaker(kl_coef)
@@ -99,12 +102,12 @@ def make_experience(self, experience_computation_ref: ExperienceCompositionRefs)
def _init_strategy(self, strategy: str):
# configure strategy
- if strategy == 'ddp':
+ if strategy == "ddp":
self._strategy = DDPStrategy()
- elif strategy == 'colossalai_gemini':
- self._strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5)
- elif strategy == 'colossalai_zero2':
- self._strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
+ elif strategy == "colossalai_gemini":
+ self._strategy = GeminiStrategy(placement_policy="cuda", initial_scale=2**5)
+ elif strategy == "colossalai_zero2":
+ self._strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
else:
raise ValueError(f'Unsupported strategy "{strategy}"')
@@ -124,11 +127,9 @@ def _prepare_model_with_strategy(self, has_optimizer: bool):
def _load_model_from_pretrained(self, model_class: Type[LoRAModule], pretrain: str):
raise NotImplementedError()
- def init_model_from_pretrained(self,
- strategy: str,
- model_class: Type[LoRAModule],
- pretrain: str,
- has_optimizer=False):
+ def init_model_from_pretrained(
+ self, strategy: str, model_class: Type[LoRAModule], pretrain: str, has_optimizer=False
+ ):
self._init_strategy(strategy)
self._load_model_from_pretrained(model_class, pretrain)
self._prepare_model_with_strategy(has_optimizer)
@@ -138,7 +139,6 @@ def eval(self):
class TrainablePPORole(BasePPORole):
-
def _load_model_from_pretrained(self, model_class, pretrain):
with self._strategy.model_init_context():
self._model = model_class(pretrain).to(torch.cuda.current_device())
@@ -161,38 +161,39 @@ def learn_on_experiences(self, experience_refs):
@ray.remote(num_gpus=1)
class RayPPOActor(TrainablePPORole):
-
def set_loss_function(self, eps_clip: float):
self._actor_loss_fn = PolicyLoss(eps_clip)
def load_tokenizer_from_pretrained(self, model_type: str, pretrained):
- if model_type == 'gpt2':
+ if model_type == "gpt2":
self._model_tokenizer = GPT2Tokenizer.from_pretrained(pretrained)
self._model_tokenizer.pad_token = self._model_tokenizer.eos_token
- elif model_type == 'bloom':
+ elif model_type == "bloom":
self._model_tokenizer = BloomTokenizerFast.from_pretrained(pretrained)
self._model_tokenizer.pad_token = self._model_tokenizer.eos_token
- elif model_type == 'opt':
+ elif model_type == "opt":
self._model_tokenizer = AutoTokenizer.from_pretrained(pretrained)
else:
raise ValueError(f'Unsupported model "{model_type}"')
# Set tokenize function for sequence generation
def _text_input_tokenize_fn(texts):
- batch = self._model_tokenizer(texts, return_tensors='pt', max_length=96, padding=True, truncation=True)
+ batch = self._model_tokenizer(texts, return_tensors="pt", max_length=96, padding=True, truncation=True)
return {k: v.cuda() for k, v in batch.items()}
self._sample_tokenize_function = _text_input_tokenize_fn
def setup_generate_kwargs(self, generate_kwargs: dict):
from coati.trainer.ppo import _set_default_generate_kwargs
+
self._generate_kwargs = _set_default_generate_kwargs(self._strategy, generate_kwargs, self._model)
- self._generate_kwargs['pad_token_id'] = self._model_tokenizer.pad_token_id
- self._generate_kwargs['eos_token_id'] = self._model_tokenizer.eos_token_id
+ self._generate_kwargs["pad_token_id"] = self._model_tokenizer.pad_token_id
+ self._generate_kwargs["eos_token_id"] = self._model_tokenizer.eos_token_id
def load_csv_prompt_file_from_url_to_sampler(self, prompt_url):
import pandas as pd
- prompts = pd.read_csv(prompt_url)['prompt']
+
+ prompts = pd.read_csv(prompt_url)["prompt"]
self._sampler = self._strategy.setup_sampler(prompts)
def _generate(self, input_ids, **generate_kwargs):
@@ -214,10 +215,9 @@ def calculate_action_log_probs(self, sequence_attention_action_mask):
def _training_step(self, experience):
num_actions = experience.action_mask.size(1)
action_log_probs = self._model(experience.sequences, num_actions, attention_mask=experience.attention_mask)
- actor_loss = self._actor_loss_fn(action_log_probs,
- experience.action_log_probs,
- experience.advantages,
- action_mask=experience.action_mask)
+ actor_loss = self._actor_loss_fn(
+ action_log_probs, experience.action_log_probs, experience.advantages, action_mask=experience.action_mask
+ )
self._strategy.backward(actor_loss, self._model, self._optimizer)
self._strategy.optimizer_step(self._optimizer)
self._optimizer.zero_grad()
@@ -229,17 +229,18 @@ def save_checkpoint(self, save_path, should_save_optimizer: bool):
self._strategy.save_model(self._model, save_path, only_rank0=True)
# save optimizer checkpoint on all ranks
if should_save_optimizer:
- self._strategy.save_optimizer(self._optimizer,
- 'actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
- only_rank0=False)
+ self._strategy.save_optimizer(
+ self._optimizer,
+ "actor_optim_checkpoint_prompts_%d.pt" % (torch.cuda.current_device()),
+ only_rank0=False,
+ )
def generate_answer(self, prompt, max_length=30, num_return_sequences=5):
- encoded_input = self._model_tokenizer(prompt, return_tensors='pt')
+ encoded_input = self._model_tokenizer(prompt, return_tensors="pt")
input_ids = {k: v.cuda() for k, v in encoded_input.items()}
- sequence, _ = self._model.generate(**input_ids,
- max_length=max_length,
- return_action_mask=False,
- num_return_sequences=num_return_sequences)
+ sequence, _ = self._model.generate(
+ **input_ids, max_length=max_length, return_action_mask=False, num_return_sequences=num_return_sequences
+ )
token_list = list(sequence.data[0])
output = " ".join([self._model_tokenizer.decode(token) for token in token_list])
return output
@@ -247,18 +248,16 @@ def generate_answer(self, prompt, max_length=30, num_return_sequences=5):
@ray.remote(num_gpus=1)
class RayPPOCritic(TrainablePPORole):
-
def set_loss_function(self, value_clip: float):
self._critic_loss_fn = ValueLoss(value_clip)
def _training_step(self, experience):
- values = self._model(experience.sequences,
- action_mask=experience.action_mask,
- attention_mask=experience.attention_mask)
- critic_loss = self._critic_loss_fn(values,
- experience.values,
- experience.reward,
- action_mask=experience.action_mask)
+ values = self._model(
+ experience.sequences, action_mask=experience.action_mask, attention_mask=experience.attention_mask
+ )
+ critic_loss = self._critic_loss_fn(
+ values, experience.values, experience.reward, action_mask=experience.action_mask
+ )
self._strategy.backward(critic_loss, self._model, self._optimizer)
self._strategy.optimizer_step(self._optimizer)
self._optimizer.zero_grad()
@@ -272,12 +271,12 @@ def calculate_value(self, sequence_attention_action_mask):
@ray.remote(num_gpus=1)
class RayPPORewardModel(BasePPORole):
-
def _load_model_from_pretrained(self, model_class, pretrain):
with self._strategy.model_init_context():
critic = model_class(pretrained=pretrain).to(torch.cuda.current_device())
- self._model = RewardModel(deepcopy(critic.model),
- deepcopy(critic.value_head)).to(torch.cuda.current_device())
+ self._model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).to(
+ torch.cuda.current_device()
+ )
@torch.no_grad()
def calculate_r(self, sequence_attention_action_mask):
@@ -287,7 +286,6 @@ def calculate_r(self, sequence_attention_action_mask):
@ray.remote(num_gpus=1)
class RayPPOInitialModel(BasePPORole):
-
def _load_model_from_pretrained(self, model_class, pretrain):
with self._strategy.model_init_context():
self._model = model_class(pretrain).to(torch.cuda.current_device())
@@ -300,8 +298,8 @@ def calculate_base_action_log_probs(self, sequence_attention_action_mask):
class PPORayActorGroup:
"""
- A group of ray actors
- Functions start with 'async' should return list of object refs
+ A group of ray actors
+ Functions start with 'async' should return list of object refs
"""
def __init__(self, num_nodes, num_gpus_per_node, ray_actor_type: Type[BasePPORole]) -> None:
@@ -319,8 +317,9 @@ def _initiate_actors(self):
pg = placement_group(bundles, strategy="STRICT_SPREAD")
ray.get(pg.ready())
if pg:
- master_actor = self.ray_actor_type.options(scheduling_strategy=PlacementGroupSchedulingStrategy(
- placement_group=pg, placement_group_bundle_index=0)).remote(world_size, 0, 0, None, None)
+ master_actor = self.ray_actor_type.options(
+ scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg, placement_group_bundle_index=0)
+ ).remote(world_size, 0, 0, None, None)
else:
master_actor = self.ray_actor_type.options(num_gpus=1).remote(world_size, 0, 0, None, None)
self._actor_handlers = [master_actor]
@@ -331,16 +330,20 @@ def _initiate_actors(self):
for rank in range(1, world_size):
local_rank = rank % self._num_gpus_per_node
if pg:
- worker_actor = self.ray_actor_type.options(scheduling_strategy=PlacementGroupSchedulingStrategy(
- placement_group=pg, placement_group_bundle_index=rank // self._num_gpus_per_node)).remote(
- world_size, rank, local_rank, master_addr, master_port)
+ worker_actor = self.ray_actor_type.options(
+ scheduling_strategy=PlacementGroupSchedulingStrategy(
+ placement_group=pg, placement_group_bundle_index=rank // self._num_gpus_per_node
+ )
+ ).remote(world_size, rank, local_rank, master_addr, master_port)
else:
- worker_actor = self.ray_actor_type.options(num_gpus=1).remote(world_size, rank, local_rank,
- master_addr, master_port)
+ worker_actor = self.ray_actor_type.options(num_gpus=1).remote(
+ world_size, rank, local_rank, master_addr, master_port
+ )
self._actor_handlers.append(worker_actor)
- def async_init_model_from_pretrained(self, strategy: str, model_class: Type[LoRAModule], pretrain: str,
- has_optimizer: bool):
+ def async_init_model_from_pretrained(
+ self, strategy: str, model_class: Type[LoRAModule], pretrain: str, has_optimizer: bool
+ ):
return [
actor.init_model_from_pretrained.remote(strategy, model_class, pretrain, has_optimizer)
for actor in self._actor_handlers
@@ -348,7 +351,6 @@ def async_init_model_from_pretrained(self, strategy: str, model_class: Type[LoRA
class TrainableModelRayActorGroup(PPORayActorGroup):
-
def async_learn_on_experiences(self, experience_refs):
num_actors = len(self._actor_handlers)
learn_result_refs = []
@@ -359,7 +361,6 @@ def async_learn_on_experiences(self, experience_refs):
class PPOActorRayActorGroup(TrainableModelRayActorGroup):
-
def __init__(self, num_nodes, num_gpus_per_node) -> None:
super().__init__(num_nodes, num_gpus_per_node, RayPPOActor)
@@ -381,7 +382,8 @@ def async_calculate_action_log_probs(self, sequences_attention_mask_action_mask_
action_log_probs_refs = []
for i in range(len(sequences_attention_mask_action_mask_refs)):
action_log_probs_ref = self._actor_handlers[i % num_actors].calculate_action_log_probs.remote(
- sequences_attention_mask_action_mask_refs[i])
+ sequences_attention_mask_action_mask_refs[i]
+ )
action_log_probs_refs.append(action_log_probs_ref)
return action_log_probs_refs
@@ -393,7 +395,6 @@ def save_checkpoint(self, save_path, should_save_optimizer):
class PPOCriticRayActorGroup(TrainableModelRayActorGroup):
-
def __init__(self, num_nodes, num_gpus_per_node) -> None:
super().__init__(num_nodes, num_gpus_per_node, RayPPOCritic)
@@ -402,7 +403,8 @@ def async_calculate_value(self, sequences_attention_mask_action_mask_refs):
value_refs = []
for i in range(len(sequences_attention_mask_action_mask_refs)):
value_ref = self._actor_handlers[i % num_actors].calculate_value.remote(
- sequences_attention_mask_action_mask_refs[i])
+ sequences_attention_mask_action_mask_refs[i]
+ )
value_refs.append(value_ref)
return value_refs
@@ -411,7 +413,6 @@ def set_loss_function(self, value_clip: float = 0.4):
class PPOInitialRayActorGroup(PPORayActorGroup):
-
def __init__(self, num_nodes, num_gpus_per_node) -> None:
super().__init__(num_nodes, num_gpus_per_node, RayPPOInitialModel)
@@ -420,13 +421,13 @@ def async_calculate_base_action_log_probs(self, sequences_attention_mask_action_
base_action_log_probs_refs = []
for i in range(len(sequences_attention_mask_action_mask_refs)):
base_action_log_probs_ref = self._actor_handlers[i % num_actors].calculate_base_action_log_probs.remote(
- sequences_attention_mask_action_mask_refs[i])
+ sequences_attention_mask_action_mask_refs[i]
+ )
base_action_log_probs_refs.append(base_action_log_probs_ref)
return base_action_log_probs_refs
class PPORewardRayActorGroup(PPORayActorGroup):
-
def __init__(self, num_nodes, num_gpus_per_node) -> None:
super().__init__(num_nodes, num_gpus_per_node, RayPPORewardModel)
@@ -435,20 +436,21 @@ def async_calculate_r(self, sequences_attention_mask_action_mask_refs):
r_refs = []
for i in range(len(sequences_attention_mask_action_mask_refs)):
r_ref = self._actor_handlers[i % num_actors].calculate_r.remote(
- sequences_attention_mask_action_mask_refs[i])
+ sequences_attention_mask_action_mask_refs[i]
+ )
r_refs.append(r_ref)
return r_refs
def main(args):
- logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
- level=logging.INFO,
- datefmt='%Y-%m-%d %H:%M:%S')
- if args.model == 'gpt2':
+ logging.basicConfig(
+ format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
+ )
+ if args.model == "gpt2":
actor_model_class, critic_model_class = GPTActor, GPTCritic
- elif args.model == 'bloom':
+ elif args.model == "bloom":
actor_model_class, critic_model_class = BLOOMActor, BLOOMCritic
- elif args.model == 'opt':
+ elif args.model == "opt":
actor_model_class, critic_model_class = OPTActor, OPTCritic
else:
raise ValueError(f'Unsupported model "{args.model}"')
@@ -462,13 +464,14 @@ def main(args):
logging.info("Actors created")
# Prepare model for training
- generate_kwargs = {'max_length': 128, 'do_sample': True, 'temperature': 1.0, 'top_k': 50}
+ generate_kwargs = {"max_length": 128, "do_sample": True, "temperature": 1.0, "top_k": 50}
ray.get(
- actor_group.async_init_model_from_pretrained(args.strategy, actor_model_class, args.pretrain, True) +
- critic_group.async_init_model_from_pretrained(args.strategy, critic_model_class, args.pretrain, True) +
- initial_group.async_init_model_from_pretrained(args.strategy, actor_model_class, args.pretrain, False) +
- reward_group.async_init_model_from_pretrained(args.strategy, critic_model_class, args.pretrain, False) +
- actor_group.async_prepare_for_sequence_generation(args.model, args.pretrain, generate_kwargs))
+ actor_group.async_init_model_from_pretrained(args.strategy, actor_model_class, args.pretrain, True)
+ + critic_group.async_init_model_from_pretrained(args.strategy, critic_model_class, args.pretrain, True)
+ + initial_group.async_init_model_from_pretrained(args.strategy, actor_model_class, args.pretrain, False)
+ + reward_group.async_init_model_from_pretrained(args.strategy, critic_model_class, args.pretrain, False)
+ + actor_group.async_prepare_for_sequence_generation(args.model, args.pretrain, generate_kwargs)
+ )
logging.info("Models prepared for training")
# Prepare models for training
@@ -483,8 +486,12 @@ def main(args):
# Start training
logging.info("Training start")
# Set all models to eval and add experience maker
- all_ray_actors = actor_group._actor_handlers + critic_group._actor_handlers + \
- initial_group._actor_handlers + reward_group._actor_handlers
+ all_ray_actors = (
+ actor_group._actor_handlers
+ + critic_group._actor_handlers
+ + initial_group._actor_handlers
+ + reward_group._actor_handlers
+ )
num_ray_actors = len(all_ray_actors)
ray.get([ray_actor.eval.remote() for ray_actor in all_ray_actors])
ray.get([ray_actor.add_experience_maker.remote() for ray_actor in all_ray_actors])
@@ -497,18 +504,28 @@ def main(args):
time += 1
# Experience queueing stage
sequences_attention_mask_action_mask_refs = actor_group.async_sample_prompts_and_make_sequence(
- experience_batch_size)
+ experience_batch_size
+ )
base_action_log_probs_refs = initial_group.async_calculate_base_action_log_probs(
- sequences_attention_mask_action_mask_refs)
+ sequences_attention_mask_action_mask_refs
+ )
values_refs = critic_group.async_calculate_value(sequences_attention_mask_action_mask_refs)
r_refs = reward_group.async_calculate_r(sequences_attention_mask_action_mask_refs)
action_log_probs_refs = actor_group.async_calculate_action_log_probs(
- sequences_attention_mask_action_mask_refs)
- experience_composition_refs.extend([
- ExperienceCompositionRefs(sequences_attention_mask_action_mask_refs[i], action_log_probs_refs[i],
- base_action_log_probs_refs[i], values_refs[i], r_refs[i])
- for i in range(len(sequences_attention_mask_action_mask_refs))
- ])
+ sequences_attention_mask_action_mask_refs
+ )
+ experience_composition_refs.extend(
+ [
+ ExperienceCompositionRefs(
+ sequences_attention_mask_action_mask_refs[i],
+ action_log_probs_refs[i],
+ base_action_log_probs_refs[i],
+ values_refs[i],
+ r_refs[i],
+ )
+ for i in range(len(sequences_attention_mask_action_mask_refs))
+ ]
+ )
# Learning stage
if time % update_timesteps == 0:
experience_refs = []
@@ -519,8 +536,9 @@ def main(args):
experience_refs.append(selected_ray_actor.make_experience.remote(exp_composition_ref))
# backward
ray.get(
- actor_group.async_learn_on_experiences(experience_refs) +
- critic_group.async_learn_on_experiences(experience_refs))
+ actor_group.async_learn_on_experiences(experience_refs)
+ + critic_group.async_learn_on_experiences(experience_refs)
+ )
# clear refs queue
experience_composition_refs.clear()
logging.info("Training finished")
@@ -528,26 +546,24 @@ def main(args):
actor_group.save_checkpoint(args.save_path, args.need_optim_ckpt)
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--prompt_csv_url', type=str)
- parser.add_argument('--strategy',
- choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'],
- default='ddp')
- parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
- parser.add_argument('--pretrain', type=str, default='gpt2')
- parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
- parser.add_argument('--need_optim_ckpt', type=bool, default=False)
- parser.add_argument('--num_episodes', type=int, default=10)
- parser.add_argument('--max_timesteps', type=int, default=10)
- parser.add_argument('--update_timesteps', type=int, default=10)
- parser.add_argument('--train_batch_size', type=int, default=8)
- parser.add_argument('--experience_batch_size', type=int, default=8)
- parser.add_argument('--num_actor_nodes', type=int, help='num of nodes to use to host actor model', default=1)
- parser.add_argument('--num_critic_nodes', type=int, help='num of nodes to use to host critic model', default=1)
- parser.add_argument('--num_initial_nodes', type=int, help='num of nodes to use to host initial model', default=1)
- parser.add_argument('--num_reward_nodes', type=int, help='num of nodes to use to host reward model', default=1)
- parser.add_argument('--num_gpus_per_node', type=int, help='num of gpus on a ray node', default=1)
+ parser.add_argument("--prompt_csv_url", type=str)
+ parser.add_argument("--strategy", choices=["ddp", "colossalai_gemini", "colossalai_zero2"], default="ddp")
+ parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt"])
+ parser.add_argument("--pretrain", type=str, default="gpt2")
+ parser.add_argument("--save_path", type=str, default="actor_checkpoint_prompts.pt")
+ parser.add_argument("--need_optim_ckpt", type=bool, default=False)
+ parser.add_argument("--num_episodes", type=int, default=10)
+ parser.add_argument("--max_timesteps", type=int, default=10)
+ parser.add_argument("--update_timesteps", type=int, default=10)
+ parser.add_argument("--train_batch_size", type=int, default=8)
+ parser.add_argument("--experience_batch_size", type=int, default=8)
+ parser.add_argument("--num_actor_nodes", type=int, help="num of nodes to use to host actor model", default=1)
+ parser.add_argument("--num_critic_nodes", type=int, help="num of nodes to use to host critic model", default=1)
+ parser.add_argument("--num_initial_nodes", type=int, help="num of nodes to use to host initial model", default=1)
+ parser.add_argument("--num_reward_nodes", type=int, help="num of nodes to use to host reward model", default=1)
+ parser.add_argument("--num_gpus_per_node", type=int, help="num of gpus on a ray node", default=1)
args = parser.parse_args()
ray.init()
main(args)
diff --git a/applications/Chat/examples/download_model.py b/applications/Chat/examples/download_model.py
index c2b5f9a859a9..ec3482b5f789 100644
--- a/applications/Chat/examples/download_model.py
+++ b/applications/Chat/examples/download_model.py
@@ -22,7 +22,7 @@ def download(self, dir_path: str):
file_path = hf_hub_download(self.repo_id, file, local_dir=dir_path)
def download_all(self):
- file_path = snapshot_download(self.repo_id)
+ snapshot_download(self.repo_id)
def test_init(model: str, dir_path: str):
@@ -31,19 +31,19 @@ def test_init(model: str, dir_path: str):
actor = GPTActor(config=config)
critic = GPTCritic(config=config)
reward_model = GPTRM(config=config)
- tokenizer = GPT2Tokenizer.from_pretrained(dir_path)
+ GPT2Tokenizer.from_pretrained(dir_path)
elif model == "bloom":
config = BloomConfig.from_pretrained(dir_path)
actor = BLOOMActor(config=config)
critic = BLOOMCritic(config=config)
reward_model = BLOOMRM(config=config)
- tokenizer = BloomTokenizerFast.from_pretrained(dir_path)
+ BloomTokenizerFast.from_pretrained(dir_path)
elif model == "opt":
config = AutoConfig.from_pretrained(dir_path)
actor = OPTActor(config=config)
critic = OPTCritic(config=config)
reward_model = OPTRM(config=config)
- tokenizer = AutoTokenizer.from_pretrained(dir_path)
+ AutoTokenizer.from_pretrained(dir_path)
else:
raise NotImplementedError(f"Model {model} not implemented")
@@ -59,17 +59,12 @@ def test_init(model: str, dir_path: str):
exit(0)
repo_list = {
- "gpt2": HFRepoFiles(
- repo_id="gpt2",
- files=["config.json", "tokenizer.json", "vocab.json", "merges.txt"]
- ),
+ "gpt2": HFRepoFiles(repo_id="gpt2", files=["config.json", "tokenizer.json", "vocab.json", "merges.txt"]),
"bloom": HFRepoFiles(
- repo_id="bigscience/bloom-560m",
- files=["config.json", "tokenizer.json", "tokenizer_config.json"]
+ repo_id="bigscience/bloom-560m", files=["config.json", "tokenizer.json", "tokenizer_config.json"]
),
"opt": HFRepoFiles(
- repo_id="facebook/opt-350m",
- files=["config.json", "tokenizer_config.json", "vocab.json", "merges.txt"]
+ repo_id="facebook/opt-350m", files=["config.json", "tokenizer_config.json", "vocab.json", "merges.txt"]
),
}
diff --git a/applications/Chat/examples/generate_conversation_dataset.py b/applications/Chat/examples/generate_conversation_dataset.py
index 8d2fbba955b8..7e03b2d54260 100644
--- a/applications/Chat/examples/generate_conversation_dataset.py
+++ b/applications/Chat/examples/generate_conversation_dataset.py
@@ -31,9 +31,11 @@ def generate_alpaca():
def generate_sharegpt():
# ShareGPT data requires less processing.
conversation_dataset = []
- dataset = load_dataset("anon8231489123/ShareGPT_Vicuna_unfiltered",
- data_files="ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json",
- split="train")
+ dataset = load_dataset(
+ "anon8231489123/ShareGPT_Vicuna_unfiltered",
+ data_files="ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json",
+ split="train",
+ )
conversations = dataset["conversations"]
@@ -43,23 +45,24 @@ def generate_sharegpt():
del conv["markdown"]
del conv["text"]
- conversation = dict(type="conversation",
- language="Multilingual",
- dataset="ShareGPT",
- conversations=conversations[idx])
+ conversation = dict(
+ type="conversation", language="Multilingual", dataset="ShareGPT", conversations=conversations[idx]
+ )
conversation_dataset.append(conversation)
return conversation_dataset
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--dataset',
- type=str,
- default="All",
- choices=["Alpaca", "ShareGPT", "All"],
- help="which dataset to convert, All will combine Alpaca and ShareGPT")
- parser.add_argument('--save_path', type=str, default="dataset.json", help="path to save the converted dataset")
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ default="All",
+ choices=["Alpaca", "ShareGPT", "All"],
+ help="which dataset to convert, All will combine Alpaca and ShareGPT",
+ )
+ parser.add_argument("--save_path", type=str, default="dataset.json", help="path to save the converted dataset")
args = parser.parse_args()
conversation_dataset = []
@@ -75,5 +78,5 @@ def generate_sharegpt():
for idx, sample in enumerate(conversation_dataset):
sample["id"] = idx + 1
- with open(args.save_path, mode='w') as f:
+ with open(args.save_path, mode="w") as f:
json.dump(conversation_dataset, f, indent=4, default=str, ensure_ascii=False)
diff --git a/applications/Chat/examples/generate_prompt_dataset.py b/applications/Chat/examples/generate_prompt_dataset.py
index 2abb31c09f82..4eec6feae505 100644
--- a/applications/Chat/examples/generate_prompt_dataset.py
+++ b/applications/Chat/examples/generate_prompt_dataset.py
@@ -6,7 +6,7 @@
def sample(args):
- with open(args.dataset_path, mode='r') as f:
+ with open(args.dataset_path, mode="r") as f:
dataset_list = json.load(f)
sampled_dataset = [
@@ -14,18 +14,14 @@ def sample(args):
for idx, sample in enumerate(random.sample(dataset_list, args.sample_size))
]
- with open(args.save_path, mode='w') as f:
- json.dump(sampled_dataset, f, indent=4,
- default=str, ensure_ascii=False)
+ with open(args.save_path, mode="w") as f:
+ json.dump(sampled_dataset, f, indent=4, default=str, ensure_ascii=False)
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--dataset_path', type=str, default=None,
- required=True, help="path to the pretrain dataset")
- parser.add_argument('--save_path', type=str, default='prompt.json',
- help="path to save the prompt dataset")
- parser.add_argument('--sample_size', type=int,
- default=16384, help="size of the prompt dataset")
+ parser.add_argument("--dataset_path", type=str, default=None, required=True, help="path to the pretrain dataset")
+ parser.add_argument("--save_path", type=str, default="prompt.json", help="path to save the prompt dataset")
+ parser.add_argument("--sample_size", type=int, default=16384, help="size of the prompt dataset")
args = parser.parse_args()
sample(args)
diff --git a/applications/Chat/examples/inference.py b/applications/Chat/examples/inference.py
index e1e57e3cd376..9df8649d9c61 100644
--- a/applications/Chat/examples/inference.py
+++ b/applications/Chat/examples/inference.py
@@ -11,13 +11,13 @@
def eval(args):
# configure model
- if args.model == 'gpt2':
+ if args.model == "gpt2":
actor = GPTActor(pretrained=args.pretrain)
- elif args.model == 'bloom':
+ elif args.model == "bloom":
actor = BLOOMActor(pretrained=args.pretrain)
- elif args.model == 'opt':
+ elif args.model == "opt":
actor = OPTActor(pretrained=args.pretrain)
- elif args.model == 'llama':
+ elif args.model == "llama":
actor = LlamaActor(pretrained=args.pretrain)
else:
raise ValueError(f'Unsupported model "{args.model}"')
@@ -28,45 +28,46 @@ def eval(args):
actor.load_state_dict(state_dict)
# configure tokenizer
- if args.model == 'gpt2':
- tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+ if args.model == "gpt2":
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'bloom':
- tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m')
+ elif args.model == "bloom":
+ tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'opt':
+ elif args.model == "opt":
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'llama':
+ elif args.model == "llama":
tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
- tokenizer.eos_token = '<\s>'
+ tokenizer.eos_token = ""
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
actor.eval()
- input_ids = tokenizer.encode(args.input,
- return_tensors='pt')\
- .to(torch.cuda.current_device())
- outputs = generate(actor,
- input_ids,
- max_length=args.max_length,
- do_sample=True,
- top_k=50,
- top_p=0.95,
- num_return_sequences=1)
- output = tokenizer.batch_decode(outputs[0],
- skip_special_tokens=True)
+ tokenizer.padding_side = "left"
+ input_ids = tokenizer.encode(args.input, return_tensors="pt").to(torch.cuda.current_device())
+ outputs = generate(
+ actor,
+ input_ids,
+ tokenizer=tokenizer,
+ max_length=args.max_length,
+ do_sample=True,
+ top_k=50,
+ top_p=0.95,
+ num_return_sequences=1,
+ )
+ output = tokenizer.batch_decode(outputs[0], skip_special_tokens=True)
print(f"[Output]: {''.join(output)}")
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
+ parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
# We suggest to use the pretrained model from HuggingFace, use pretrain to configure model
- parser.add_argument('--pretrain', type=str, default=None)
- parser.add_argument('--model_path', type=str, default=None)
- parser.add_argument('--input', type=str, default='Question: How are you ? Answer:')
- parser.add_argument('--max_length', type=int, default=100)
+ parser.add_argument("--pretrain", type=str, default=None)
+ parser.add_argument("--model_path", type=str, default=None)
+ parser.add_argument("--input", type=str, default="Question: How are you ? Answer:")
+ parser.add_argument("--max_length", type=int, default=100)
args = parser.parse_args()
eval(args)
diff --git a/applications/Chat/examples/ray/1mmt_prompt.py b/applications/Chat/examples/ray/1mmt_prompt.py
index 5dd52f1790e6..8de6219ec4e9 100644
--- a/applications/Chat/examples/ray/1mmt_prompt.py
+++ b/applications/Chat/examples/ray/1mmt_prompt.py
@@ -5,7 +5,6 @@
import pandas as pd
import ray
-import torch
from coati.quant import llama_load_quant, low_resource_init
from coati.ray.detached_trainer_ppo import DetachedPPOTrainer
from coati.ray.experience_maker_holder import ExperienceMakerHolder
@@ -23,13 +22,13 @@
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
- s.bind(('', 0))
+ s.bind(("", 0))
return s.getsockname()[1]
def get_local_ip():
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
- s.connect(('8.8.8.8', 80))
+ s.connect(("8.8.8.8", 80))
return s.getsockname()[0]
@@ -37,22 +36,25 @@ def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
- env_info_trainers = [{
- 'local_rank': '0',
- 'rank': str(rank),
- 'world_size': str(args.num_trainers),
- 'master_port': trainer_port,
- 'master_addr': master_addr
- } for rank in range(args.num_trainers)]
+ env_info_trainers = [
+ {
+ "local_rank": "0",
+ "rank": str(rank),
+ "world_size": str(args.num_trainers),
+ "master_port": trainer_port,
+ "master_addr": master_addr,
+ }
+ for rank in range(args.num_trainers)
+ ]
# maker_env_info
maker_port = str(get_free_port())
env_info_maker = {
- 'local_rank': '0',
- 'rank': '0',
- 'world_size': '1',
- 'master_port': maker_port,
- 'master_addr': master_addr
+ "local_rank": "0",
+ "rank": "0",
+ "world_size": "1",
+ "master_port": maker_port,
+ "master_addr": master_addr,
}
# configure tokenizer
@@ -75,27 +77,33 @@ def trainer_model_fn():
eval_performance=True,
debug=args.debug,
update_lora_weights=not (args.lora_rank == 0),
- ) for i, env_info_trainer in enumerate(env_info_trainers)
+ )
+ for i, env_info_trainer in enumerate(env_info_trainers)
]
def model_fn():
actor = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
critic = get_critic_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
reward_model = get_reward_model_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
- if args.initial_model_quant_ckpt is not None and args.model == 'llama':
+ if args.initial_model_quant_ckpt is not None and args.model == "llama":
# quantize initial model
actor_cfg = AutoConfig.from_pretrained(args.pretrain)
with low_resource_init(), no_init_weights():
initial_model = get_actor_from_args(args.model, config=actor_cfg)
- initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits,
- args.quant_group_size).cuda().requires_grad_(False)
+ initial_model.model = (
+ llama_load_quant(
+ initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, args.quant_group_size
+ )
+ .cuda()
+ .requires_grad_(False)
+ )
else:
initial_model = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
return actor, critic, reward_model, initial_model
# configure Experience Maker
experience_holder_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote(
- detached_trainer_name_list=[f'trainer{i}' for i in range(args.num_trainers)],
+ detached_trainer_name_list=[f"trainer{i}" for i in range(args.num_trainers)],
strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
model_fn=model_fn,
env_info=env_info_maker,
@@ -130,12 +138,11 @@ def model_fn():
dataset_size = args.experience_batch_size * 4
def build_dataloader():
-
def tokenize_fn(texts):
- batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
+ batch = tokenizer(texts, return_tensors="pt", max_length=96, padding="max_length", truncation=True)
return {k: v.cuda() for k, v in batch.items()}
- dataset = pd.read_csv(args.prompt_path)['prompt']
+ dataset = pd.read_csv(args.prompt_path)["prompt"]
dataloader = DataLoader(dataset=dataset, batch_size=dataset_size, shuffle=True, collate_fn=tokenize_fn)
return dataloader
@@ -144,32 +151,31 @@ def tokenize_fn(texts):
ray.get(wait_tasks)
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--prompt_path', type=str, default=None)
- parser.add_argument('--num_trainers', type=int, default=1)
- parser.add_argument('--trainer_strategy',
- choices=[
- 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu',
- 'colossalai_zero2_cpu'
- ],
- default='ddp')
- parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
- parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
- parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
- parser.add_argument('--pretrain', type=str, default=None)
- parser.add_argument('--critic_pretrain', type=str, default=None)
- parser.add_argument('--experience_steps', type=int, default=4)
- parser.add_argument('--experience_batch_size', type=int, default=8)
- parser.add_argument('--train_epochs', type=int, default=1)
- parser.add_argument('--update_steps', type=int, default=2)
- parser.add_argument('--train_batch_size', type=int, default=8)
- parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
-
- parser.add_argument('--initial_model_quant_ckpt', type=str, default=None)
- parser.add_argument('--quant_bits', type=int, default=4)
- parser.add_argument('--quant_group_size', type=int, default=128)
- parser.add_argument('--debug', action='store_true')
+ parser.add_argument("--prompt_path", type=str, default=None)
+ parser.add_argument("--num_trainers", type=int, default=1)
+ parser.add_argument(
+ "--trainer_strategy",
+ choices=["ddp", "colossalai_gemini", "colossalai_zero2", "colossalai_gemini_cpu", "colossalai_zero2_cpu"],
+ default="ddp",
+ )
+ parser.add_argument("--maker_strategy", choices=["naive"], default="naive")
+ parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
+ parser.add_argument("--critic_model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
+ parser.add_argument("--pretrain", type=str, default=None)
+ parser.add_argument("--critic_pretrain", type=str, default=None)
+ parser.add_argument("--experience_steps", type=int, default=4)
+ parser.add_argument("--experience_batch_size", type=int, default=8)
+ parser.add_argument("--train_epochs", type=int, default=1)
+ parser.add_argument("--update_steps", type=int, default=2)
+ parser.add_argument("--train_batch_size", type=int, default=8)
+ parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+
+ parser.add_argument("--initial_model_quant_ckpt", type=str, default=None)
+ parser.add_argument("--quant_bits", type=int, default=4)
+ parser.add_argument("--quant_group_size", type=int, default=128)
+ parser.add_argument("--debug", action="store_true")
args = parser.parse_args()
ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
main(args)
diff --git a/applications/Chat/examples/ray/mmmt_prompt.py b/applications/Chat/examples/ray/mmmt_prompt.py
index 76929c9d0144..7c03a0468b02 100644
--- a/applications/Chat/examples/ray/mmmt_prompt.py
+++ b/applications/Chat/examples/ray/mmmt_prompt.py
@@ -5,7 +5,6 @@
import pandas as pd
import ray
-import torch
from coati.quant import llama_load_quant, low_resource_init
from coati.ray.detached_trainer_ppo import DetachedPPOTrainer
from coati.ray.experience_maker_holder import ExperienceMakerHolder
@@ -23,13 +22,13 @@
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
- s.bind(('', 0))
+ s.bind(("", 0))
return s.getsockname()[1]
def get_local_ip():
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
- s.connect(('8.8.8.8', 80))
+ s.connect(("8.8.8.8", 80))
return s.getsockname()[0]
@@ -37,23 +36,29 @@ def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
- env_info_trainers = [{
- 'local_rank': '0',
- 'rank': str(rank),
- 'world_size': str(args.num_trainers),
- 'master_port': trainer_port,
- 'master_addr': master_addr
- } for rank in range(args.num_trainers)]
+ env_info_trainers = [
+ {
+ "local_rank": "0",
+ "rank": str(rank),
+ "world_size": str(args.num_trainers),
+ "master_port": trainer_port,
+ "master_addr": master_addr,
+ }
+ for rank in range(args.num_trainers)
+ ]
# maker_env_info
maker_port = str(get_free_port())
- env_info_makers = [{
- 'local_rank': '0',
- 'rank': str(rank),
- 'world_size': str(args.num_makers),
- 'master_port': maker_port,
- 'master_addr': master_addr
- } for rank in range(args.num_makers)]
+ env_info_makers = [
+ {
+ "local_rank": "0",
+ "rank": str(rank),
+ "world_size": str(args.num_makers),
+ "master_port": maker_port,
+ "master_addr": master_addr,
+ }
+ for rank in range(args.num_makers)
+ ]
# configure tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
@@ -63,13 +68,18 @@ def model_fn():
actor = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
critic = get_critic_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
reward_model = get_reward_model_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
- if args.initial_model_quant_ckpt is not None and args.model == 'llama':
+ if args.initial_model_quant_ckpt is not None and args.model == "llama":
# quantize initial model
actor_cfg = AutoConfig.from_pretrained(args.pretrain)
with low_resource_init(), no_init_weights():
initial_model = get_actor_from_args(args.model, config=actor_cfg)
- initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits,
- args.quant_group_size).cuda().requires_grad_(False)
+ initial_model.model = (
+ llama_load_quant(
+ initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, args.quant_group_size
+ )
+ .cuda()
+ .requires_grad_(False)
+ )
else:
initial_model = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
return actor, critic, reward_model, initial_model
@@ -78,7 +88,7 @@ def model_fn():
experience_holder_refs = [
ExperienceMakerHolder.options(name=f"maker{i}", num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=[
- f'trainer{x}'
+ f"trainer{x}"
for x in get_receivers_per_sender(i, args.num_makers, args.num_trainers, allow_idle_sender=False)
],
strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
@@ -87,8 +97,8 @@ def model_fn():
kl_coef=0.1,
debug=args.debug,
update_lora_weights=not (args.lora_rank == 0),
- # sync_models_from_trainers=True,
- # generation kwargs:
+ # sync_models_from_trainers=True,
+ # generation kwargs:
max_length=512,
do_sample=True,
temperature=1.0,
@@ -128,12 +138,11 @@ def trainer_model_fn():
dataset_size = args.experience_batch_size * 4
def build_dataloader():
-
def tokenize_fn(texts):
- batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True)
+ batch = tokenizer(texts, return_tensors="pt", max_length=96, padding="max_length", truncation=True)
return {k: v.cuda() for k, v in batch.items()}
- dataset = pd.read_csv(args.prompt_path)['prompt']
+ dataset = pd.read_csv(args.prompt_path)["prompt"]
dataloader = DataLoader(dataset=dataset, batch_size=dataset_size, shuffle=True, collate_fn=tokenize_fn)
return dataloader
@@ -148,39 +157,44 @@ def tokenize_fn(texts):
for experience_holder_ref in experience_holder_refs:
wait_tasks.append(experience_holder_ref.workingloop.remote(build_dataloader, num_steps=args.experience_steps))
- total_steps = args.experience_batch_size * args.experience_steps * \
- args.num_makers // (args.num_trainers * args.train_batch_size)
+ total_steps = (
+ args.experience_batch_size
+ * args.experience_steps
+ * args.num_makers
+ // (args.num_trainers * args.train_batch_size)
+ )
for trainer_ref in trainer_refs:
wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs))
ray.get(wait_tasks)
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--prompt_path', type=str, default=None)
- parser.add_argument('--num_makers', type=int, default=1)
- parser.add_argument('--num_trainers', type=int, default=1)
+ parser.add_argument("--prompt_path", type=str, default=None)
+ parser.add_argument("--num_makers", type=int, default=1)
+ parser.add_argument("--num_trainers", type=int, default=1)
parser.add_argument(
- '--trainer_strategy',
- choices=['ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', 'colossalai_zero2_cpu'],
- default='ddp')
- parser.add_argument('--maker_strategy', choices=['naive'], default='naive')
- parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
- parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
- parser.add_argument('--pretrain', type=str, default=None)
- parser.add_argument('--critic_pretrain', type=str, default=None)
- parser.add_argument('--experience_steps', type=int, default=4)
- parser.add_argument('--experience_batch_size', type=int, default=8)
- parser.add_argument('--train_epochs', type=int, default=1)
- parser.add_argument('--update_steps', type=int, default=2)
- parser.add_argument('--train_batch_size', type=int, default=8)
- parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
-
- parser.add_argument('--initial_model_quant_ckpt', type=str, default=None)
- parser.add_argument('--quant_bits', type=int, default=4)
- parser.add_argument('--quant_group_size', type=int, default=128)
- parser.add_argument('--debug', action='store_true')
+ "--trainer_strategy",
+ choices=["ddp", "colossalai_gemini", "colossalai_zero2", "colossalai_gemini_cpu", "colossalai_zero2_cpu"],
+ default="ddp",
+ )
+ parser.add_argument("--maker_strategy", choices=["naive"], default="naive")
+ parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
+ parser.add_argument("--critic_model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
+ parser.add_argument("--pretrain", type=str, default=None)
+ parser.add_argument("--critic_pretrain", type=str, default=None)
+ parser.add_argument("--experience_steps", type=int, default=4)
+ parser.add_argument("--experience_batch_size", type=int, default=8)
+ parser.add_argument("--train_epochs", type=int, default=1)
+ parser.add_argument("--update_steps", type=int, default=2)
+ parser.add_argument("--train_batch_size", type=int, default=8)
+ parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+
+ parser.add_argument("--initial_model_quant_ckpt", type=str, default=None)
+ parser.add_argument("--quant_bits", type=int, default=4)
+ parser.add_argument("--quant_group_size", type=int, default=128)
+ parser.add_argument("--debug", action="store_true")
args = parser.parse_args()
ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
diff --git a/applications/Chat/examples/requirements.txt b/applications/Chat/examples/requirements.txt
index 5d0f9f927d17..5474dfa16b3e 100644
--- a/applications/Chat/examples/requirements.txt
+++ b/applications/Chat/examples/requirements.txt
@@ -1,3 +1,3 @@
pandas>=1.4.1
sentencepiece
-colossalai==0.3.1
\ No newline at end of file
+colossalai==0.3.3
diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py
index d27a70a3fef6..40e06043ab57 100644
--- a/applications/Chat/examples/train_prompts.py
+++ b/applications/Chat/examples/train_prompts.py
@@ -20,28 +20,32 @@
def main(args):
# configure strategy
- if args.strategy == 'ddp':
+ if args.strategy == "ddp":
strategy = DDPStrategy()
- elif args.strategy == 'colossalai_gemini':
- strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5)
- elif args.strategy == 'colossalai_zero2':
- strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
+ elif args.strategy == "colossalai_gemini":
+ strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
+ elif args.strategy == "colossalai_zero2":
+ strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
if args.rm_path is not None:
- warnings.warn('LoRA weights should be merged with the model weights')
- state_dict = torch.load(args.rm_path, map_location='cpu')
+ warnings.warn("LoRA weights should be merged with the model weights")
+ state_dict = torch.load(args.rm_path, map_location="cpu")
+
+ if args.lora_rank > 0:
+ warnings.warn("Lora is not supported yet.")
+ args.lora_rank = 0
with strategy.model_init_context():
# configure model
- if args.model == 'gpt2':
+ if args.model == "gpt2":
initial_model = GPTActor(pretrained=args.pretrain)
- elif args.model == 'bloom':
+ elif args.model == "bloom":
initial_model = BLOOMActor(pretrained=args.pretrain)
- elif args.model == 'opt':
+ elif args.model == "opt":
initial_model = OPTActor(pretrained=args.pretrain)
- elif args.model == 'llama':
+ elif args.model == "llama":
initial_model = LlamaActor(pretrained=args.pretrain)
else:
raise ValueError(f'Unsupported actor model "{args.model}"')
@@ -51,13 +55,13 @@ def main(args):
else:
rm_model_name = args.rm_model
- if rm_model_name == 'gpt2':
+ if rm_model_name == "gpt2":
reward_model = GPTRM(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
- elif rm_model_name == 'bloom':
+ elif rm_model_name == "bloom":
reward_model = BLOOMRM(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
- elif rm_model_name == 'opt':
+ elif rm_model_name == "opt":
reward_model = OPTRM(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
- elif rm_model_name == 'llama':
+ elif rm_model_name == "llama":
reward_model = LlamaRM(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
else:
raise ValueError(f'Unsupported reward model "{rm_model_name}"')
@@ -65,28 +69,28 @@ def main(args):
if args.rm_path is not None:
reward_model.load_state_dict(state_dict, strict=False)
- initial_model.to(torch.float16).to(torch.cuda.current_device())
- reward_model.to(torch.float16).to(torch.cuda.current_device())
+ initial_model.to(torch.bfloat16).to(torch.cuda.current_device())
+ reward_model.to(torch.bfloat16).to(torch.cuda.current_device())
- if args.model == 'gpt2':
+ if args.model == "gpt2":
actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
- elif args.model == 'bloom':
+ elif args.model == "bloom":
actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
- elif args.model == 'opt':
+ elif args.model == "opt":
actor = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
- elif args.model == 'llama':
+ elif args.model == "llama":
actor = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
else:
raise ValueError(f'Unsupported actor model "{args.model}"')
- if rm_model_name == 'gpt2':
- critic = GPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
- elif rm_model_name == 'bloom':
- critic = BLOOMCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
- elif rm_model_name == 'opt':
- critic = OPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
- elif rm_model_name == 'llama':
- critic = LlamaCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank, use_action_mask=True)
+ if rm_model_name == "gpt2":
+ critic = GPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
+ elif rm_model_name == "bloom":
+ critic = BLOOMCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
+ elif rm_model_name == "opt":
+ critic = OPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
+ elif rm_model_name == "llama":
+ critic = LlamaCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
else:
raise ValueError(f'Unsupported reward model "{rm_model_name}"')
@@ -94,65 +98,72 @@ def main(args):
critic.load_state_dict(state_dict, strict=False)
del state_dict
- if args.strategy != 'colossalai_gemini':
- critic.to(torch.float16).to(torch.cuda.current_device())
- actor.to(torch.float16).to(torch.cuda.current_device())
+ actor.to(torch.bfloat16).to(torch.cuda.current_device())
+ critic.to(torch.bfloat16).to(torch.cuda.current_device())
# configure optimizer
- if args.strategy.startswith('colossalai'):
- actor_optim = HybridAdam(actor.parameters(), lr=1e-7)
- critic_optim = HybridAdam(critic.parameters(), lr=1e-7)
+ if args.strategy.startswith("colossalai"):
+ actor_optim = HybridAdam(actor.parameters(), lr=args.lr)
+ critic_optim = HybridAdam(critic.parameters(), lr=args.lr)
else:
- actor_optim = Adam(actor.parameters(), lr=1e-7)
- critic_optim = Adam(critic.parameters(), lr=1e-7)
+ actor_optim = Adam(actor.parameters(), lr=args.lr)
+ critic_optim = Adam(critic.parameters(), lr=args.lr)
# configure tokenizer
- if args.model == 'gpt2':
- tokenizer = GPT2Tokenizer.from_pretrained(
- 'gpt2' if args.tokenizer is None else args.tokenizer)
+ if args.model == "gpt2":
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2" if args.tokenizer is None else args.tokenizer)
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'bloom':
+ elif args.model == "bloom":
tokenizer = BloomTokenizerFast.from_pretrained(
- 'bigscience/bloom-560m' if args.tokenizer is None else args.tokenizer)
+ "bigscience/bloom-560m" if args.tokenizer is None else args.tokenizer
+ )
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'opt':
- tokenizer = AutoTokenizer.from_pretrained(
- "facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
+ elif args.model == "opt":
+ tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'llama':
+ elif args.model == "llama":
tokenizer = LlamaTokenizer.from_pretrained(
- "hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer)
- tokenizer.eos_token = '<\s>'
+ "hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
+ )
+ tokenizer.eos_token = ""
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
-
- prompt_dataset = PromptDataset(tokenizer=tokenizer, data_path=args.prompt_dataset, max_datasets_size=16384)
+ # NOTE: generate() requires padding_side to be "left"
+ tokenizer.padding_side = "left"
+
+ prompt_dataset = PromptDataset(
+ tokenizer=tokenizer,
+ data_path=args.prompt_dataset,
+ max_datasets_size=args.max_datasets_size,
+ max_length=args.max_input_len,
+ )
if dist.is_initialized() and dist.get_world_size() > 1:
prompt_sampler = DistributedSampler(prompt_dataset, shuffle=True, seed=42, drop_last=True)
else:
prompt_sampler = None
- prompt_dataloader = DataLoader(prompt_dataset,
- shuffle=(prompt_sampler is None),
- sampler=prompt_sampler,
- batch_size=args.experience_batch_size)
-
- pretrain_dataset = SupervisedDataset(tokenizer=tokenizer,
- data_path=args.pretrain_dataset,
- max_datasets_size=16384,
- max_length=args.max_input_len)
+ prompt_dataloader = DataLoader(
+ prompt_dataset, shuffle=(prompt_sampler is None), sampler=prompt_sampler, batch_size=args.experience_batch_size
+ )
+
+ pretrain_dataset = SupervisedDataset(
+ tokenizer=tokenizer,
+ data_path=args.pretrain_dataset,
+ max_datasets_size=args.max_datasets_size,
+ max_length=args.max_input_len,
+ )
if dist.is_initialized() and dist.get_world_size() > 1:
pretrain_sampler = DistributedSampler(pretrain_dataset, shuffle=True, seed=42, drop_last=True)
else:
pretrain_sampler = None
- pretrain_dataloader = DataLoader(pretrain_dataset,
- shuffle=(pretrain_sampler is None),
- sampler=pretrain_sampler,
- batch_size=args.ptx_batch_size)
+ pretrain_dataloader = DataLoader(
+ pretrain_dataset, shuffle=(pretrain_sampler is None), sampler=pretrain_sampler, batch_size=args.ptx_batch_size
+ )
# NOTE: For small models like opt-1.3b, reward model and initial model are not required to be parallelized.
- (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = \
- strategy.prepare((actor, actor_optim), (critic, critic_optim), reward_model, initial_model)
+ (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare(
+ (actor, actor_optim), (critic, critic_optim), reward_model, initial_model
+ )
# configure trainer
trainer = PPOTrainer(
@@ -163,6 +174,7 @@ def main(args):
initial_model,
actor_optim,
critic_optim,
+ tokenizer=tokenizer,
kl_coef=args.kl_coef,
ptx_coef=args.ptx_coef,
train_batch_size=args.train_batch_size,
@@ -171,52 +183,67 @@ def main(args):
do_sample=True,
temperature=1.0,
top_k=50,
- pad_token_id=tokenizer.pad_token_id,
- eos_token_id=tokenizer.eos_token_id,
- offload_inference_models=args.strategy != 'colossalai_gemini'
+ offload_inference_models=args.strategy != "colossalai_gemini",
)
- trainer.fit(prompt_dataloader=prompt_dataloader,
- pretrain_dataloader=pretrain_dataloader,
- num_episodes=args.num_episodes,
- num_collect_steps=args.num_collect_steps,
- num_update_steps=args.num_update_steps)
+ trainer.fit(
+ num_episodes=args.num_episodes,
+ num_collect_steps=args.num_collect_steps,
+ num_update_steps=args.num_update_steps,
+ prompt_dataloader=prompt_dataloader,
+ pretrain_dataloader=pretrain_dataloader,
+ log_dir=args.log_dir,
+ use_wandb=args.use_wandb,
+ )
+ if args.lora_rank > 0 and args.merge_lora_weights:
+ from coati.models.lora import LORA_MANAGER
+
+ # NOTE: set model to eval to merge LoRA weights
+ LORA_MANAGER.merge_weights = True
+ actor.eval()
# save model checkpoint after fitting
- strategy.save_model(actor, args.save_path, only_rank0=True)
+ strategy.save_pretrained(actor, path=args.save_path)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
- strategy.save_optimizer(actor_optim,
- 'actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()),
- only_rank0=False)
+ strategy.save_optimizer(
+ actor_optim, "actor_optim_checkpoint_prompts_%d.pt" % (torch.cuda.current_device()), only_rank0=False
+ )
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--prompt_dataset', type=str, default=None, help='path to the prompt dataset')
- parser.add_argument('--pretrain_dataset', type=str, default=None, help='path to the pretrained dataset')
- parser.add_argument('--strategy',
- choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'],
- default='colossalai_zero2',
- help='strategy to use')
- parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama'])
- parser.add_argument('--tokenizer', type=str, default=None)
- parser.add_argument('--pretrain', type=str, default=None)
- parser.add_argument('--rm_model', default=None, choices=['gpt2', 'bloom', 'opt', 'llama'])
- parser.add_argument('--rm_path', type=str, default=None)
- parser.add_argument('--rm_pretrain', type=str, default=None)
- parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts')
- parser.add_argument('--need_optim_ckpt', type=bool, default=False)
- parser.add_argument('--num_episodes', type=int, default=10)
- parser.add_argument('--num_collect_steps', type=int, default=10)
- parser.add_argument('--num_update_steps', type=int, default=5)
- parser.add_argument('--train_batch_size', type=int, default=8)
- parser.add_argument('--ptx_batch_size', type=int, default=1)
- parser.add_argument('--experience_batch_size', type=int, default=8)
- parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
- parser.add_argument('--kl_coef', type=float, default=0.1)
- parser.add_argument('--ptx_coef', type=float, default=0.9)
- parser.add_argument('--max_input_len', type=int, default=96)
- parser.add_argument('--max_seq_len', type=int, default=128)
+ parser.add_argument("--prompt_dataset", type=str, default=None, help="path to the prompt dataset")
+ parser.add_argument("--pretrain_dataset", type=str, default=None, help="path to the pretrained dataset")
+ parser.add_argument("--max_datasets_size", type=int, default=50000)
+ parser.add_argument(
+ "--strategy",
+ choices=["ddp", "colossalai_gemini", "colossalai_zero2"],
+ default="colossalai_zero2",
+ help="strategy to use",
+ )
+ parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
+ parser.add_argument("--tokenizer", type=str, default=None)
+ parser.add_argument("--pretrain", type=str, default=None)
+ parser.add_argument("--rm_model", default=None, choices=["gpt2", "bloom", "opt", "llama"])
+ parser.add_argument("--rm_path", type=str, default=None)
+ parser.add_argument("--rm_pretrain", type=str, default=None)
+ parser.add_argument("--save_path", type=str, default="actor_checkpoint_prompts")
+ parser.add_argument("--need_optim_ckpt", type=bool, default=False)
+ parser.add_argument("--num_episodes", type=int, default=10)
+ parser.add_argument("--num_collect_steps", type=int, default=10)
+ parser.add_argument("--num_update_steps", type=int, default=5)
+ parser.add_argument("--train_batch_size", type=int, default=8)
+ parser.add_argument("--ptx_batch_size", type=int, default=1)
+ parser.add_argument("--experience_batch_size", type=int, default=8)
+ parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+ parser.add_argument("--merge_lora_weights", type=bool, default=True)
+ parser.add_argument("--lr", type=float, default=1e-7)
+ parser.add_argument("--kl_coef", type=float, default=0.1)
+ parser.add_argument("--ptx_coef", type=float, default=0.9)
+ parser.add_argument("--max_input_len", type=int, default=96)
+ parser.add_argument("--max_seq_len", type=int, default=128)
+ parser.add_argument("--log_dir", default="logs", type=str)
+ parser.add_argument("--use_wandb", default=False, action="store_true")
args = parser.parse_args()
main(args)
diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py
index 190460bc20f6..fcdd29b2954b 100644
--- a/applications/Chat/examples/train_reward_model.py
+++ b/applications/Chat/examples/train_reward_model.py
@@ -1,5 +1,5 @@
import argparse
-from random import randint
+import warnings
import torch
import torch.distributed as dist
@@ -24,65 +24,69 @@
def train(args):
# configure strategy
- if args.strategy == 'ddp':
+ if args.strategy == "ddp":
strategy = DDPStrategy()
- elif args.strategy == 'colossalai_gemini':
- strategy = GeminiStrategy(placement_policy='cuda')
- elif args.strategy == 'colossalai_zero2':
- strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
+ elif args.strategy == "colossalai_gemini":
+ strategy = GeminiStrategy(placement_policy="auto")
+ elif args.strategy == "colossalai_zero2":
+ strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
# configure model
+ if args.lora_rank > 0:
+ warnings.warn("Lora is not supported yet.")
+ args.lora_rank = 0
+
with strategy.model_init_context():
- if args.model == 'bloom':
+ if args.model == "bloom":
model = BLOOMRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
- elif args.model == 'opt':
+ elif args.model == "opt":
model = OPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
- elif args.model == 'gpt2':
+ elif args.model == "gpt2":
model = GPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
- elif args.model == 'llama':
+ elif args.model == "llama":
model = LlamaRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
else:
raise ValueError(f'Unsupported model "{args.model}"')
- model.to(torch.float16).to(torch.cuda.current_device())
+ model.to(torch.bfloat16).to(torch.cuda.current_device())
if args.model_path is not None:
state_dict = torch.load(args.model_path)
model.load_state_dict(state_dict)
# configure tokenizer
- if args.model == 'gpt2':
- tokenizer = GPT2Tokenizer.from_pretrained(
- 'gpt2' if args.tokenizer is None else args.tokenizer)
+ if args.model == "gpt2":
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2" if args.tokenizer is None else args.tokenizer)
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'bloom':
+ elif args.model == "bloom":
tokenizer = BloomTokenizerFast.from_pretrained(
- 'bigscience/bloom-560m' if args.tokenizer is None else args.tokenizer)
+ "bigscience/bloom-560m" if args.tokenizer is None else args.tokenizer
+ )
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'opt':
- tokenizer = AutoTokenizer.from_pretrained(
- "facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
+ elif args.model == "opt":
+ tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'llama':
+ elif args.model == "llama":
tokenizer = LlamaTokenizer.from_pretrained(
- "hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer)
- tokenizer.eos_token = '<\s>'
+ "hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
+ )
+ tokenizer.eos_token = ""
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
# configure optimizer
- if args.strategy.startswith('colossalai'):
- optim = HybridAdam(model.parameters(), lr=5e-6)
+ if args.strategy.startswith("colossalai"):
+ optim = HybridAdam(model.parameters(), lr=args.lr)
else:
- optim = Adam(model.parameters(), lr=5e-6)
+ optim = Adam(model.parameters(), lr=args.lr)
# configure loss function
- if args.loss_fn == 'log_sig':
+ if args.loss_fn == "log_sig":
loss_fn = LogSigLoss()
- elif args.loss_fn == 'log_exp':
+ elif args.loss_fn == "log_exp":
loss_fn = LogExpLoss()
else:
raise ValueError(f'Unsupported loss function "{args.loss_fn}"')
@@ -93,110 +97,112 @@ def train(args):
else:
data = load_dataset(args.dataset)
- if args.test:
- train_data = data['train'].select(range(20))
- eval_data = data['test'].select(range(5))
- else:
- train_data = data['train']
- eval_data = data['test']
- valid_data = data['test'].select((randint(0, len(eval_data) - 1) for _ in range(len(eval_data) // 5)))
+ train_data = data["train"].select(range(min(args.max_datasets_size, len(data["train"]))))
+ eval_data = data["test"].select(range(min(args.max_datasets_size, len(data["test"]))))
- if args.dataset == 'Dahoas/rm-static':
+ if args.dataset == "Dahoas/rm-static":
train_dataset = RmStaticDataset(train_data, tokenizer, args.max_len)
- valid_dataset = RmStaticDataset(valid_data, tokenizer, args.max_len)
eval_dataset = RmStaticDataset(eval_data, tokenizer, args.max_len)
- elif args.dataset == 'Anthropic/hh-rlhf':
+ elif args.dataset == "Anthropic/hh-rlhf":
train_dataset = HhRlhfDataset(train_data, tokenizer, args.max_len)
- valid_dataset = HhRlhfDataset(valid_data, tokenizer, args.max_len)
eval_dataset = HhRlhfDataset(eval_data, tokenizer, args.max_len)
else:
raise ValueError(f'Unsupported dataset "{args.dataset}"')
if dist.is_initialized() and dist.get_world_size() > 1:
- train_sampler = DistributedSampler(train_dataset,
- shuffle=True,
- seed=42,
- drop_last=True,
- rank=dist.get_rank(),
- num_replicas=dist.get_world_size())
- valid_sampler = DistributedSampler(valid_dataset,
- shuffle=True,
- seed=42,
- drop_last=True,
- rank=dist.get_rank(),
- num_replicas=dist.get_world_size())
- eval_sampler = DistributedSampler(eval_dataset,
- shuffle=True,
- seed=42,
- drop_last=True,
- rank=dist.get_rank(),
- num_replicas=dist.get_world_size())
+ train_sampler = DistributedSampler(
+ train_dataset,
+ shuffle=True,
+ seed=42,
+ drop_last=True,
+ rank=dist.get_rank(),
+ num_replicas=dist.get_world_size(),
+ )
+ eval_sampler = DistributedSampler(
+ eval_dataset,
+ shuffle=True,
+ seed=42,
+ drop_last=True,
+ rank=dist.get_rank(),
+ num_replicas=dist.get_world_size(),
+ )
else:
train_sampler = None
- valid_sampler = None
eval_sampler = None
- train_dataloader = DataLoader(train_dataset,
- shuffle=(train_sampler is None),
- sampler=train_sampler,
- batch_size=args.batch_size,
- pin_memory=True)
-
- valid_dataloader = DataLoader(valid_dataset,
- shuffle=(valid_sampler is None),
- sampler=valid_sampler,
- batch_size=args.batch_size,
- pin_memory=True)
+ train_dataloader = DataLoader(
+ train_dataset,
+ shuffle=(train_sampler is None),
+ sampler=train_sampler,
+ batch_size=args.batch_size,
+ pin_memory=True,
+ )
- eval_dataloader = DataLoader(eval_dataset,
- shuffle=(eval_sampler is None),
- sampler=eval_sampler,
- batch_size=args.batch_size,
- pin_memory=True)
+ eval_dataloader = DataLoader(
+ eval_dataset, shuffle=(eval_sampler is None), sampler=eval_sampler, batch_size=args.batch_size, pin_memory=True
+ )
lr_scheduler = CosineAnnealingLR(optim, train_dataloader.__len__() // 100)
strategy_dict = strategy.prepare(dict(model=model, optimizer=optim, lr_scheduler=lr_scheduler))
- model = strategy_dict['model']
- optim = strategy_dict['optimizer']
- lr_scheduler = strategy_dict['lr_scheduler']
- trainer = RewardModelTrainer(model=model,
- strategy=strategy,
- optim=optim,
- lr_scheduler=lr_scheduler,
- loss_fn=loss_fn,
- max_epochs=args.max_epochs)
-
- trainer.fit(train_dataloader=train_dataloader, valid_dataloader=valid_dataloader, eval_dataloader=eval_dataloader)
+ model = strategy_dict["model"]
+ optim = strategy_dict["optimizer"]
+ lr_scheduler = strategy_dict["lr_scheduler"]
+ trainer = RewardModelTrainer(
+ model=model,
+ strategy=strategy,
+ optim=optim,
+ lr_scheduler=lr_scheduler,
+ loss_fn=loss_fn,
+ max_epochs=args.max_epochs,
+ )
+
+ trainer.fit(
+ train_dataloader=train_dataloader,
+ eval_dataloader=eval_dataloader,
+ log_dir=args.log_dir,
+ use_wandb=args.use_wandb,
+ )
+
+ if args.lora_rank > 0 and args.merge_lora_weights:
+ from coati.models.lora import LORA_MANAGER
+
+ # NOTE: set model to eval to merge LoRA weights
+ LORA_MANAGER.merge_weights = True
+ model.eval()
# save model checkpoint after fitting on only rank0
- strategy.save_model(model, args.save_path, only_rank0=True)
+ state_dict = model.state_dict()
+ torch.save(state_dict, args.save_path)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
- strategy.save_optimizer(trainer.optimizer,
- 'rm_optim_checkpoint_%d.pt' % (torch.cuda.current_device()),
- only_rank0=False)
+ strategy.save_optimizer(
+ trainer.optimizer, "rm_optim_checkpoint_%d.pt" % (torch.cuda.current_device()), only_rank0=False
+ )
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--strategy',
- choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'],
- default='colossalai_zero2')
- parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom')
- parser.add_argument('--tokenizer', type=str, default=None)
- parser.add_argument('--pretrain', type=str, default=None)
- parser.add_argument('--model_path', type=str, default=None)
- parser.add_argument('--need_optim_ckpt', type=bool, default=False)
- parser.add_argument('--dataset',
- type=str,
- choices=['Anthropic/hh-rlhf', 'Dahoas/rm-static'],
- default='Dahoas/rm-static')
- parser.add_argument('--subset', type=lambda x: None if x == 'None' else x, default=None)
- parser.add_argument('--save_path', type=str, default='rm_ckpt')
- parser.add_argument('--max_epochs', type=int, default=1)
- parser.add_argument('--batch_size', type=int, default=1)
- parser.add_argument('--max_len', type=int, default=512)
- parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
- parser.add_argument('--loss_fn', type=str, default='log_sig', choices=['log_sig', 'log_exp'])
- parser.add_argument('--test', type=bool, default=False)
+ parser.add_argument(
+ "--strategy", choices=["ddp", "colossalai_gemini", "colossalai_zero2"], default="colossalai_zero2"
+ )
+ parser.add_argument("--model", choices=["gpt2", "bloom", "opt", "llama"], default="bloom")
+ parser.add_argument("--tokenizer", type=str, default=None)
+ parser.add_argument("--pretrain", type=str, default=None)
+ parser.add_argument("--model_path", type=str, default=None)
+ parser.add_argument("--need_optim_ckpt", type=bool, default=False)
+ parser.add_argument(
+ "--dataset", type=str, choices=["Anthropic/hh-rlhf", "Dahoas/rm-static"], default="Dahoas/rm-static"
+ )
+ parser.add_argument("--subset", type=lambda x: None if x == "None" else x, default=None)
+ parser.add_argument("--max_datasets_size", type=int, default=1000000)
+ parser.add_argument("--save_path", type=str, default="rm_ckpt")
+ parser.add_argument("--max_epochs", type=int, default=1)
+ parser.add_argument("--batch_size", type=int, default=1)
+ parser.add_argument("--max_len", type=int, default=512)
+ parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+ parser.add_argument("--merge_lora_weights", type=bool, default=True)
+ parser.add_argument("--lr", type=float, default=9e-6)
+ parser.add_argument("--loss_fn", type=str, default="log_sig", choices=["log_sig", "log_exp"])
+ parser.add_argument("--log_dir", default="logs", type=str)
+ parser.add_argument("--use_wandb", default=False, action="store_true")
args = parser.parse_args()
train(args)
diff --git a/applications/Chat/examples/train_rm.sh b/applications/Chat/examples/train_rm.sh
index cc1b7be2815f..c5ebaf708ddc 100755
--- a/applications/Chat/examples/train_rm.sh
+++ b/applications/Chat/examples/train_rm.sh
@@ -16,7 +16,10 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
set_n_least_used_CUDA_VISIBLE_DEVICES 2
torchrun --standalone --nproc_per_node=2 train_reward_model.py \
- --model 'bloom' \
+ --pretrain 'gpt2' \
+ --model 'gpt2' \
--strategy colossalai_zero2 \
- --loss_fn 'log_sig' \
- --dataset 'Anthropic/hh-rlhf'
+ --loss_fn 'log_exp' \
+ --dataset 'Anthropic/hh-rlhf' \
+ --batch_size 16 \
+ --max_epochs 10
diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py
index f068ea2bf5de..d00c04809a2d 100644
--- a/applications/Chat/examples/train_sft.py
+++ b/applications/Chat/examples/train_sft.py
@@ -6,206 +6,216 @@
import torch.distributed as dist
from coati.dataset import SFTDataset, SupervisedDataset
from coati.models.bloom import BLOOMActor
+from coati.models.chatglm import ChatGLMActor
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
from coati.models.gpt import GPTActor
from coati.models.llama import LlamaActor
from coati.models.opt import OPTActor
-from coati.models.chatglm import ChatGLMActor
from coati.trainer import SFTTrainer
from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
from datasets import load_dataset
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer, AutoModel
-from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
+from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from transformers.trainer import get_scheduler
from colossalai.logging import get_dist_logger
from colossalai.nn.optimizer import HybridAdam
-from colossalai.tensor import ColoParameter
def train(args):
# configure strategy
- if args.strategy == 'ddp':
+ if args.strategy == "ddp":
strategy = DDPStrategy()
- elif args.strategy == 'colossalai_gemini':
- strategy = GeminiStrategy(placement_policy='cuda')
- elif args.strategy == 'colossalai_zero2':
- strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda')
- elif args.strategy == 'colossalai_zero2_cpu':
- strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu')
+ elif args.strategy == "colossalai_gemini":
+ strategy = GeminiStrategy(placement_policy="auto")
+ elif args.strategy == "colossalai_zero2":
+ strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
+ elif args.strategy == "colossalai_zero2_cpu":
+ strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
# configure model
if args.lora_rank > 0:
- warnings.warn("Gradient checkpoint is disabled when using LoRA")
- args.grad_checkpoint = False
+ warnings.warn("Lora is not supported yet.")
+ args.lora_rank = 0
+
with strategy.model_init_context():
- if args.model == 'bloom':
- model = BLOOMActor(pretrained=args.pretrain,
- lora_rank=args.lora_rank,
- checkpoint=args.grad_checkpoint)
- elif args.model == 'opt':
- model = OPTActor(pretrained=args.pretrain,
- lora_rank=args.lora_rank,
- checkpoint=args.grad_checkpoint)
- elif args.model == 'gpt2':
- model = GPTActor(pretrained=args.pretrain,
- lora_rank=args.lora_rank,
- checkpoint=args.grad_checkpoint)
- elif args.model == 'llama':
- model = LlamaActor(pretrained=args.pretrain,
- lora_rank=args.lora_rank,
- checkpoint=args.grad_checkpoint)
- elif args.model == 'chatglm':
+ if args.model == "bloom":
+ model = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
+ elif args.model == "opt":
+ model = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
+ elif args.model == "gpt2":
+ model = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
+ elif args.model == "llama":
+ model = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
+ elif args.model == "chatglm":
model = ChatGLMActor(pretrained=args.pretrain)
else:
raise ValueError(f'Unsupported model "{args.model}"')
- model.to(torch.float16).to(torch.cuda.current_device())
+ model.to(torch.bfloat16).to(torch.cuda.current_device())
# configure tokenizer
- if args.model == 'gpt2':
- tokenizer = GPT2Tokenizer.from_pretrained(
- 'gpt2' if args.tokenizer is None else args.tokenizer)
+ if args.model == "gpt2":
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2" if args.tokenizer is None else args.tokenizer)
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'bloom':
+ elif args.model == "bloom":
tokenizer = BloomTokenizerFast.from_pretrained(
- 'bigscience/bloom-560m' if args.tokenizer is None else args.tokenizer)
+ "bigscience/bloom-560m" if args.tokenizer is None else args.tokenizer
+ )
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'opt':
- tokenizer = AutoTokenizer.from_pretrained(
- "facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
+ elif args.model == "opt":
+ tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
tokenizer.pad_token = tokenizer.eos_token
- elif args.model == 'llama':
+ elif args.model == "llama":
tokenizer = LlamaTokenizer.from_pretrained(
- "hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer)
- tokenizer.eos_token = '<\s>'
+ "hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
+ )
+ tokenizer.eos_token = ""
tokenizer.pad_token = tokenizer.unk_token
- elif args.model == 'chatglm':
+ elif args.model == "chatglm":
tokenizer = ChatGLMTokenizer.from_pretrained(
- "THUDM/chatglm-6b" if args.tokenizer is None else args.tokenizer, trust_remote_code=True)
+ "THUDM/chatglm-6b" if args.tokenizer is None else args.tokenizer, trust_remote_code=True
+ )
else:
raise ValueError(f'Unsupported model "{args.model}"')
- if args.model == 'llama' and args.strategy == 'colossalai_gemini':
- # this is a hack to deal with the resized embedding
- # to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatibility
- for name, param in model.named_parameters():
- if not isinstance(param, ColoParameter):
- sub_module_name = '.'.join(name.split('.')[:-1])
- weight_name = name.split('.')[-1]
- sub_module = model.get_submodule(sub_module_name)
- setattr(sub_module, weight_name, ColoParameter(param))
-
# configure optimizer
- if args.strategy.startswith('colossalai'):
+ if args.strategy.startswith("colossalai"):
optim = HybridAdam(model.parameters(), lr=args.lr, clipping_norm=1.0)
else:
optim = Adam(model.parameters(), lr=args.lr)
- logger = get_dist_logger()
# configure dataset
- if args.dataset == 'yizhongw/self_instruct':
- train_data = load_dataset(args.dataset, 'super_natural_instructions', split='train')
- eval_data = load_dataset(args.dataset, 'super_natural_instructions', split='test')
+ if args.dataset == "yizhongw/self_instruct":
+ train_data = load_dataset(args.dataset, "super_natural_instructions", split="train")
+ eval_data = load_dataset(args.dataset, "super_natural_instructions", split="test")
+
+ if args.max_datasets_size is not None:
+ train_data = train_data.select(range(min(args.max_datasets_size, len(train_data))))
+ eval_data = eval_data.select(range(min(args.max_datasets_size, len(eval_data))))
train_dataset = SFTDataset(train_data, tokenizer, args.max_len)
eval_dataset = SFTDataset(eval_data, tokenizer, args.max_len)
else:
- train_dataset = SupervisedDataset(tokenizer=tokenizer,
- data_path=args.dataset,
- max_datasets_size=args.max_datasets_size,
- max_length=args.max_len)
+ train_dataset = SupervisedDataset(
+ tokenizer=tokenizer,
+ data_path=args.dataset,
+ max_datasets_size=args.max_datasets_size,
+ max_length=args.max_len,
+ )
eval_dataset = None
if dist.is_initialized() and dist.get_world_size() > 1:
- train_sampler = DistributedSampler(train_dataset,
- shuffle=True,
- seed=42,
- drop_last=True,
- rank=dist.get_rank(),
- num_replicas=dist.get_world_size())
+ train_sampler = DistributedSampler(
+ train_dataset,
+ shuffle=True,
+ seed=42,
+ drop_last=True,
+ rank=dist.get_rank(),
+ num_replicas=dist.get_world_size(),
+ )
if eval_dataset is not None:
- eval_sampler = DistributedSampler(eval_dataset,
- shuffle=False,
- seed=42,
- drop_last=False,
- rank=dist.get_rank(),
- num_replicas=dist.get_world_size())
+ eval_sampler = DistributedSampler(
+ eval_dataset,
+ shuffle=False,
+ seed=42,
+ drop_last=False,
+ rank=dist.get_rank(),
+ num_replicas=dist.get_world_size(),
+ )
else:
train_sampler = None
eval_sampler = None
- train_dataloader = DataLoader(train_dataset,
- shuffle=(train_sampler is None),
- sampler=train_sampler,
- batch_size=args.batch_size,
- pin_memory=True)
+ train_dataloader = DataLoader(
+ train_dataset,
+ shuffle=(train_sampler is None),
+ sampler=train_sampler,
+ batch_size=args.batch_size,
+ pin_memory=True,
+ )
if eval_dataset is not None:
- eval_dataloader = DataLoader(eval_dataset,
- shuffle=(eval_sampler is None),
- sampler=eval_sampler,
- batch_size=args.batch_size,
- pin_memory=True)
+ eval_dataloader = DataLoader(
+ eval_dataset,
+ shuffle=(eval_sampler is None),
+ sampler=eval_sampler,
+ batch_size=args.batch_size,
+ pin_memory=True,
+ )
else:
eval_dataloader = None
num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
max_steps = math.ceil(args.max_epochs * num_update_steps_per_epoch)
- lr_scheduler = get_scheduler("cosine",
- optim,
- num_warmup_steps=math.ceil(max_steps * 0.03),
- num_training_steps=max_steps)
+ lr_scheduler = get_scheduler(
+ "cosine", optim, num_warmup_steps=math.ceil(max_steps * 0.03), num_training_steps=max_steps
+ )
strategy_dict = strategy.prepare(dict(model=model, optimizer=optim, lr_scheduler=lr_scheduler))
- model = strategy_dict['model']
- optim = strategy_dict['optimizer']
- lr_scheduler = strategy_dict['lr_scheduler']
- trainer = SFTTrainer(model=model,
- strategy=strategy,
- optim=optim,
- lr_scheduler=lr_scheduler,
- max_epochs=args.max_epochs,
- accumulation_steps=args.accumulation_steps)
-
- trainer.fit(train_dataloader=train_dataloader,
- eval_dataloader=eval_dataloader,
- logger=logger,
- use_wandb=args.use_wandb)
+ model = strategy_dict["model"]
+ optim = strategy_dict["optimizer"]
+ lr_scheduler = strategy_dict["lr_scheduler"]
+ trainer = SFTTrainer(
+ model=model,
+ strategy=strategy,
+ optim=optim,
+ lr_scheduler=lr_scheduler,
+ max_epochs=args.max_epochs,
+ accumulation_steps=args.accumulation_steps,
+ )
+ logger = get_dist_logger()
+ trainer.fit(
+ train_dataloader=train_dataloader,
+ eval_dataloader=eval_dataloader,
+ logger=logger,
+ log_dir=args.log_dir,
+ use_wandb=args.use_wandb,
+ )
+
+ if args.lora_rank > 0 and args.merge_lora_weights:
+ from coati.models.lora import LORA_MANAGER
+
+ # NOTE: set model to eval to merge LoRA weights
+ LORA_MANAGER.merge_weights = True
+ model.eval()
# save model checkpoint after fitting on only rank0
- strategy.save_pretrained(model, path=args.save_path, only_rank0=True, tokenizer=tokenizer)
+ strategy.save_pretrained(model, path=args.save_path, tokenizer=tokenizer)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
- strategy.save_optimizer(trainer.optimizer,
- 'rm_optim_checkpoint_%d.pt' % (torch.cuda.current_device()),
- only_rank0=False)
+ strategy.save_optimizer(
+ trainer.optimizer, "rm_optim_checkpoint_%d.pt" % (torch.cuda.current_device()), only_rank0=False
+ )
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--strategy',
- choices=['ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_zero2_cpu'],
- default='colossalai_zero2')
- parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama', 'chatglm'], default='bloom')
- parser.add_argument('--tokenizer', type=str, default=None)
- parser.add_argument('--pretrain', type=str, default=None)
- parser.add_argument('--dataset', type=str, default=None)
- parser.add_argument('--max_datasets_size', type=int, default=None)
- parser.add_argument('--save_path', type=str, default='output')
- parser.add_argument('--need_optim_ckpt', type=bool, default=False)
- parser.add_argument('--max_epochs', type=int, default=3)
- parser.add_argument('--batch_size', type=int, default=4)
- parser.add_argument('--max_len', type=int, default=512)
- parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
- parser.add_argument('--log_interval', type=int, default=100, help="how many steps to log")
- parser.add_argument('--lr', type=float, default=5e-6)
- parser.add_argument('--accumulation_steps', type=int, default=8)
- parser.add_argument('--use_wandb', default=False, action='store_true')
- parser.add_argument('--grad_checkpoint', default=False, action='store_true')
+ parser.add_argument(
+ "--strategy",
+ choices=["ddp", "colossalai_gemini", "colossalai_zero2", "colossalai_zero2_cpu"],
+ default="colossalai_zero2",
+ )
+ parser.add_argument("--model", choices=["gpt2", "bloom", "opt", "llama", "chatglm"], default="bloom")
+ parser.add_argument("--tokenizer", type=str, default=None)
+ parser.add_argument("--pretrain", type=str, default=None)
+ parser.add_argument("--dataset", type=str, default=None)
+ parser.add_argument("--max_datasets_size", type=int, default=None)
+ parser.add_argument("--save_path", type=str, default="output")
+ parser.add_argument("--need_optim_ckpt", type=bool, default=False)
+ parser.add_argument("--max_epochs", type=int, default=3)
+ parser.add_argument("--batch_size", type=int, default=4)
+ parser.add_argument("--max_len", type=int, default=512)
+ parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
+ parser.add_argument("--merge_lora_weights", type=bool, default=True)
+ parser.add_argument("--lr", type=float, default=5e-6)
+ parser.add_argument("--accumulation_steps", type=int, default=8)
+ parser.add_argument("--log_dir", default="logs", type=str)
+ parser.add_argument("--use_wandb", default=False, action="store_true")
+ parser.add_argument("--grad_checkpoint", default=False, action="store_true")
args = parser.parse_args()
train(args)
diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh
index 1a5cd069011d..0fb4da3d3ce8 100755
--- a/applications/Chat/examples/train_sft.sh
+++ b/applications/Chat/examples/train_sft.sh
@@ -19,7 +19,6 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \
--strategy colossalai_zero2 \
- --log_interval 10 \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 4 \
diff --git a/applications/Chat/inference/benchmark.py b/applications/Chat/inference/benchmark.py
index 438a1e3ef1c7..dbb5490a63dc 100644
--- a/applications/Chat/inference/benchmark.py
+++ b/applications/Chat/inference/benchmark.py
@@ -84,28 +84,34 @@ def evaluate(
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
- 'pretrained',
- help='Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.')
- parser.add_argument('--quant',
- choices=['8bit', '4bit'],
- default=None,
- help='Quantization mode. Default: None (no quantization, fp16).')
+ "pretrained",
+ help="Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.",
+ )
+ parser.add_argument(
+ "--quant",
+ choices=["8bit", "4bit"],
+ default=None,
+ help="Quantization mode. Default: None (no quantization, fp16).",
+ )
parser.add_argument(
- '--gptq_checkpoint',
+ "--gptq_checkpoint",
default=None,
- help='Path to GPTQ checkpoint. This is only useful when quantization mode is 4bit. Default: None.')
- parser.add_argument('--gptq_group_size',
- type=int,
- default=128,
- help='Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.')
+ help="Path to GPTQ checkpoint. This is only useful when quantization mode is 4bit. Default: None.",
+ )
+ parser.add_argument(
+ "--gptq_group_size",
+ type=int,
+ default=128,
+ help="Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.",
+ )
args = parser.parse_args()
- if args.quant == '4bit':
- assert args.gptq_checkpoint is not None, 'Please specify a GPTQ checkpoint.'
+ if args.quant == "4bit":
+ assert args.gptq_checkpoint is not None, "Please specify a GPTQ checkpoint."
tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
- if args.quant == '4bit':
+ if args.quant == "4bit":
with low_resource_init():
config = LlamaConfig.from_pretrained(args.pretrained)
model = LlamaForCausalLM(config)
@@ -114,12 +120,12 @@ def evaluate(
else:
model = LlamaForCausalLM.from_pretrained(
args.pretrained,
- load_in_8bit=(args.quant == '8bit'),
+ load_in_8bit=(args.quant == "8bit"),
torch_dtype=torch.float16,
device_map="auto",
)
- if args.quant != '8bit':
- model.half() # seems to fix bugs for some users.
+ if args.quant != "8bit":
+ model.half() # seems to fix bugs for some users.
model.eval()
total_tokens = 0
@@ -129,7 +135,7 @@ def evaluate(
resp, tokens = evaluate(model, tokenizer, instruction, temperature=0.2, num_beams=1)
total_tokens += tokens
print(f"Response: {resp}")
- print('\n----------------------------\n')
+ print("\n----------------------------\n")
duration = time() - start
- print(f'Total time: {duration:.3f} s, {total_tokens/duration:.3f} tokens/s')
- print(f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB')
+ print(f"Total time: {duration:.3f} s, {total_tokens/duration:.3f} tokens/s")
+ print(f"Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB")
diff --git a/applications/Chat/inference/locustfile.py b/applications/Chat/inference/locustfile.py
index 9443d4b99180..333262e538ac 100644
--- a/applications/Chat/inference/locustfile.py
+++ b/applications/Chat/inference/locustfile.py
@@ -1,26 +1,26 @@
-from json import JSONDecodeError
-
from locust import HttpUser, task
-samples = [[
- dict(
- instruction='Who is the best player in the history of NBA?',
- response='The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1'
- ),
- dict(instruction='continue this talk', response=''),
-], [
- dict(instruction='Who is the best player in the history of NBA?', response=''),
-]]
+samples = [
+ [
+ dict(
+ instruction="Who is the best player in the history of NBA?",
+ response="The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1",
+ ),
+ dict(instruction="continue this talk", response=""),
+ ],
+ [
+ dict(instruction="Who is the best player in the history of NBA?", response=""),
+ ],
+]
class GenerationUser(HttpUser):
-
@task
def generate(self):
for sample in samples:
- data = {'max_new_tokens': 64, 'history': sample}
- with self.client.post('/generate', json=data, catch_response=True) as response:
+ data = {"max_new_tokens": 64, "history": sample}
+ with self.client.post("/generate", json=data, catch_response=True) as response:
if response.status_code in (200, 406):
response.success()
else:
- response.failure('Response wrong')
+ response.failure("Response wrong")
diff --git a/applications/Chat/inference/server.py b/applications/Chat/inference/server.py
index 9d6b7fabef54..7c6a61b9e7f2 100644
--- a/applications/Chat/inference/server.py
+++ b/applications/Chat/inference/server.py
@@ -16,7 +16,7 @@
from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
from utils import ChatPromptProcessor, Dialogue, LockedIterator, load_json, sample_streamingly, update_model_kwargs_fn
-CONTEXT = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.'
+CONTEXT = "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions."
MAX_LEN = 512
running_lock = Lock()
@@ -36,11 +36,11 @@ class GenerationTaskReq(BaseModel):
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
# set CORS
-origin_spec_from_env = os.environ.get('CORS_ORIGIN', None)
+origin_spec_from_env = os.environ.get("CORS_ORIGIN", None)
if origin_spec_from_env is not None:
# allow CORS from the specified origins
- origins = os.environ['CORS_ORIGIN'].split(',')
+ origins = os.environ["CORS_ORIGIN"].split(",")
else:
# allow CORS from all origins
origins = ["*"]
@@ -58,13 +58,13 @@ def generate_streamingly(prompt, max_new_tokens, top_k, top_p, temperature):
inputs = {k: v.cuda() for k, v in tokenizer(prompt, return_tensors="pt").items()}
# TODO(ver217): streaming generation does not support repetition_penalty now
model_kwargs = {
- 'max_generate_tokens': max_new_tokens,
- 'early_stopping': True,
- 'top_k': top_k,
- 'top_p': top_p,
- 'temperature': temperature,
- 'prepare_inputs_fn': model.prepare_inputs_for_generation,
- 'update_model_kwargs_fn': update_model_kwargs_fn,
+ "max_generate_tokens": max_new_tokens,
+ "early_stopping": True,
+ "top_k": top_k,
+ "top_p": top_p,
+ "temperature": temperature,
+ "prepare_inputs_fn": model.prepare_inputs_for_generation,
+ "update_model_kwargs_fn": update_model_kwargs_fn,
}
is_first_word = True
generator = LockedIterator(sample_streamingly(model, **inputs, **model_kwargs), running_lock)
@@ -81,9 +81,9 @@ def generate_streamingly(prompt, max_new_tokens, top_k, top_p, temperature):
if is_first_word:
out_string = out_string.lstrip()
is_first_word = False
- elif current_sub_tokens[0].startswith('▁'):
+ elif current_sub_tokens[0].startswith("▁"):
# whitespace will be ignored by the frontend
- out_string = ' ' + out_string
+ out_string = " " + out_string
yield out_string
@@ -92,32 +92,33 @@ async def event_generator(request: Request, generator: Generator):
if await request.is_disconnected():
break
try:
- yield {'event': 'generate', 'data': next(generator)}
+ yield {"event": "generate", "data": next(generator)}
except StopIteration:
- yield {'event': 'end', 'data': ''}
+ yield {"event": "end", "data": ""}
break
-@app.post('/generate/stream')
-@limiter.limit('1/second')
+@app.post("/generate/stream")
+@limiter.limit("1/second")
def generate(data: GenerationTaskReq, request: Request):
prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
event_source = event_generator(
- request, generate_streamingly(prompt, data.max_new_tokens, data.top_k, data.top_p, data.temperature))
+ request, generate_streamingly(prompt, data.max_new_tokens, data.top_k, data.top_p, data.temperature)
+ )
return EventSourceResponse(event_source)
-@app.post('/generate')
-@limiter.limit('1/second')
+@app.post("/generate")
+@limiter.limit("1/second")
def generate_no_stream(data: GenerationTaskReq, request: Request):
prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
if prompt_processor.has_censored_words(prompt):
return prompt_processor.SAFE_RESPONSE
inputs = {k: v.cuda() for k, v in tokenizer(prompt, return_tensors="pt").items()}
with running_lock:
- output = model.generate(**inputs, **data.dict(exclude={'history'}))
+ output = model.generate(**inputs, **data.dict(exclude={"history"}))
output = output.cpu()
- prompt_len = inputs['input_ids'].size(1)
+ prompt_len = inputs["input_ids"].size(1)
response = output[0, prompt_len:]
out_string = tokenizer.decode(response, skip_special_tokens=True)
out_string = prompt_processor.postprocess_output(out_string)
@@ -126,32 +127,40 @@ def generate_no_stream(data: GenerationTaskReq, request: Request):
return out_string
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
- 'pretrained',
- help='Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.')
- parser.add_argument('--quant',
- choices=['8bit', '4bit'],
- default=None,
- help='Quantization mode. Default: None (no quantization, fp16).')
+ "pretrained",
+ help="Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.",
+ )
parser.add_argument(
- '--gptq_checkpoint',
+ "--quant",
+ choices=["8bit", "4bit"],
default=None,
- help='Path to GPTQ checkpoint. This is only useful when quantization mode is 4bit. Default: None.')
- parser.add_argument('--gptq_group_size',
- type=int,
- default=128,
- help='Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.')
- parser.add_argument('--http_host', default='0.0.0.0')
- parser.add_argument('--http_port', type=int, default=7070)
- parser.add_argument('--profanity_file',
- default=None,
- help='Path to profanity words list. It should be a JSON file containing a list of words.')
+ help="Quantization mode. Default: None (no quantization, fp16).",
+ )
+ parser.add_argument(
+ "--gptq_checkpoint",
+ default=None,
+ help="Path to GPTQ checkpoint. This is only useful when quantization mode is 4bit. Default: None.",
+ )
+ parser.add_argument(
+ "--gptq_group_size",
+ type=int,
+ default=128,
+ help="Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.",
+ )
+ parser.add_argument("--http_host", default="0.0.0.0")
+ parser.add_argument("--http_port", type=int, default=7070)
+ parser.add_argument(
+ "--profanity_file",
+ default=None,
+ help="Path to profanity words list. It should be a JSON file containing a list of words.",
+ )
args = parser.parse_args()
- if args.quant == '4bit':
- assert args.gptq_checkpoint is not None, 'Please specify a GPTQ checkpoint.'
+ if args.quant == "4bit":
+ assert args.gptq_checkpoint is not None, "Please specify a GPTQ checkpoint."
tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
@@ -161,7 +170,7 @@ def generate_no_stream(data: GenerationTaskReq, request: Request):
censored_words = []
prompt_processor = ChatPromptProcessor(tokenizer, CONTEXT, MAX_LEN, censored_words=censored_words)
- if args.quant == '4bit':
+ if args.quant == "4bit":
with low_resource_init():
config = LlamaConfig.from_pretrained(args.pretrained)
model = LlamaForCausalLM(config)
@@ -170,12 +179,12 @@ def generate_no_stream(data: GenerationTaskReq, request: Request):
else:
model = LlamaForCausalLM.from_pretrained(
args.pretrained,
- load_in_8bit=(args.quant == '8bit'),
+ load_in_8bit=(args.quant == "8bit"),
torch_dtype=torch.float16,
device_map="auto",
)
- if args.quant != '8bit':
- model.half() # seems to fix bugs for some users.
+ if args.quant != "8bit":
+ model.half() # seems to fix bugs for some users.
model.eval()
config = uvicorn.Config(app, host=args.http_host, port=args.http_port)
diff --git a/applications/Chat/inference/tests/test_chat_prompt.py b/applications/Chat/inference/tests/test_chat_prompt.py
index 23028d4959cb..9835e71894c6 100644
--- a/applications/Chat/inference/tests/test_chat_prompt.py
+++ b/applications/Chat/inference/tests/test_chat_prompt.py
@@ -3,41 +3,49 @@
from transformers import AutoTokenizer
from utils import ChatPromptProcessor, Dialogue
-CONTEXT = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.'
-tokenizer = AutoTokenizer.from_pretrained(os.environ['PRETRAINED_PATH'])
+CONTEXT = "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions."
+tokenizer = AutoTokenizer.from_pretrained(os.environ["PRETRAINED_PATH"])
samples = [
- ([
- Dialogue(
- instruction='Who is the best player in the history of NBA?',
- response='The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1'
- ),
- Dialogue(instruction='continue this talk', response=''),
- ], 128,
- 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\nWho is the best player in the history of NBA?\n\n### Response:\nThe best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1\n\n### Instruction:\ncontinue this talk\n\n### Response:\n'
+ (
+ [
+ Dialogue(
+ instruction="Who is the best player in the history of NBA?",
+ response="The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1",
+ ),
+ Dialogue(instruction="continue this talk", response=""),
+ ],
+ 128,
+ "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\nWho is the best player in the history of NBA?\n\n### Response:\nThe best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1\n\n### Instruction:\ncontinue this talk\n\n### Response:\n",
),
- ([
- Dialogue(
- instruction='Who is the best player in the history of NBA?',
- response='The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1'
- ),
- Dialogue(instruction='continue this talk', response=''),
- ], 200,
- 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\ncontinue this talk\n\n### Response:\n'
+ (
+ [
+ Dialogue(
+ instruction="Who is the best player in the history of NBA?",
+ response="The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1",
+ ),
+ Dialogue(instruction="continue this talk", response=""),
+ ],
+ 200,
+ "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\ncontinue this talk\n\n### Response:\n",
),
- ([
- Dialogue(
- instruction='Who is the best player in the history of NBA?',
- response='The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1'
- ),
- Dialogue(instruction='continue this talk', response=''),
- ], 211,
- 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\ncontinue this\n\n### Response:\n'
+ (
+ [
+ Dialogue(
+ instruction="Who is the best player in the history of NBA?",
+ response="The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1",
+ ),
+ Dialogue(instruction="continue this talk", response=""),
+ ],
+ 211,
+ "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\ncontinue this\n\n### Response:\n",
),
- ([
- Dialogue(instruction='Who is the best player in the history of NBA?', response=''),
- ], 128,
- 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\nWho is the best player in the history of NBA?\n\n### Response:\n'
+ (
+ [
+ Dialogue(instruction="Who is the best player in the history of NBA?", response=""),
+ ],
+ 128,
+ "Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\nWho is the best player in the history of NBA?\n\n### Response:\n",
),
]
@@ -49,5 +57,5 @@ def test_chat_prompt_processor():
assert prompt == result
-if __name__ == '__main__':
+if __name__ == "__main__":
test_chat_prompt_processor()
diff --git a/applications/Chat/inference/utils.py b/applications/Chat/inference/utils.py
index e8e7b05ac719..af018adf6e9d 100644
--- a/applications/Chat/inference/utils.py
+++ b/applications/Chat/inference/utils.py
@@ -20,9 +20,9 @@
from transformers.generation import LogitsProcessorList, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper
-def prepare_logits_processor(top_k: Optional[int] = None,
- top_p: Optional[float] = None,
- temperature: Optional[float] = None) -> LogitsProcessorList:
+def prepare_logits_processor(
+ top_k: Optional[int] = None, top_p: Optional[float] = None, temperature: Optional[float] = None
+) -> LogitsProcessorList:
processor_list = LogitsProcessorList()
if temperature is not None and temperature != 1.0:
processor_list.append(TemperatureLogitsWarper(temperature))
@@ -41,29 +41,30 @@ def _is_sequence_finished(unfinished_sequences: torch.Tensor) -> bool:
return unfinished_sequences.max() == 0
-def sample_streamingly(model: nn.Module,
- input_ids: torch.Tensor,
- max_generate_tokens: int,
- early_stopping: bool = False,
- eos_token_id: Optional[int] = None,
- pad_token_id: Optional[int] = None,
- top_k: Optional[int] = None,
- top_p: Optional[float] = None,
- temperature: Optional[float] = None,
- prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
- update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
- **model_kwargs) -> Generator:
-
+def sample_streamingly(
+ model: nn.Module,
+ input_ids: torch.Tensor,
+ max_generate_tokens: int,
+ early_stopping: bool = False,
+ eos_token_id: Optional[int] = None,
+ pad_token_id: Optional[int] = None,
+ top_k: Optional[int] = None,
+ top_p: Optional[float] = None,
+ temperature: Optional[float] = None,
+ prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
+ update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
+ **model_kwargs,
+) -> Generator:
logits_processor = prepare_logits_processor(top_k, top_p, temperature)
unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
for _ in range(max_generate_tokens):
- model_inputs = prepare_inputs_fn(input_ids, **model_kwargs) if prepare_inputs_fn is not None else {
- 'input_ids': input_ids
- }
+ model_inputs = (
+ prepare_inputs_fn(input_ids, **model_kwargs) if prepare_inputs_fn is not None else {"input_ids": input_ids}
+ )
outputs = model(**model_inputs)
- next_token_logits = outputs['logits'][:, -1, :]
+ next_token_logits = outputs["logits"][:, -1, :]
# pre-process distribution
next_token_logits = logits_processor(input_ids, next_token_logits)
# sample
@@ -107,25 +108,26 @@ def update_model_kwargs_fn(outputs: dict, **model_kwargs) -> dict:
if "attention_mask" in model_kwargs:
attention_mask = model_kwargs["attention_mask"]
model_kwargs["attention_mask"] = torch.cat(
- [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
+ [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+ )
return model_kwargs
class Dialogue(BaseModel):
- instruction: str = Field(min_length=1, example='Count up from 1 to 500.')
- response: str = Field(example='')
+ instruction: str = Field(min_length=1, example="Count up from 1 to 500.")
+ response: str = Field(example="")
-def _format_dialogue(instruction: str, response: str = ''):
- return f'\n\n### Instruction:\n{instruction}\n\n### Response:\n{response}'
+def _format_dialogue(instruction: str, response: str = ""):
+ return f"\n\n### Instruction:\n{instruction}\n\n### Response:\n{response}"
-STOP_PAT = re.compile(r'(###|instruction:).*', flags=(re.I | re.S))
+STOP_PAT = re.compile(r"(###|instruction:).*", flags=(re.I | re.S))
class ChatPromptProcessor:
- SAFE_RESPONSE = 'The input/response contains inappropriate content, please rephrase your prompt.'
+ SAFE_RESPONSE = "The input/response contains inappropriate content, please rephrase your prompt."
def __init__(self, tokenizer, context: str, max_len: int = 2048, censored_words: List[str] = []):
self.tokenizer = tokenizer
@@ -138,42 +140,48 @@ def __init__(self, tokenizer, context: str, max_len: int = 2048, censored_words:
def preprocess_prompt(self, history: List[Dialogue], max_new_tokens: int) -> str:
if self.context_len is None:
- self.context_len = len(self.tokenizer(self.context)['input_ids'])
+ self.context_len = len(self.tokenizer(self.context)["input_ids"])
if self.dialogue_placeholder_len is None:
self.dialogue_placeholder_len = len(
- self.tokenizer(_format_dialogue(''), add_special_tokens=False)['input_ids'])
+ self.tokenizer(_format_dialogue(""), add_special_tokens=False)["input_ids"]
+ )
prompt = self.context
# the last dialogue must be in the prompt
last_dialogue = history.pop()
# the response of the last dialogue is empty
- assert last_dialogue.response == ''
- if len(self.tokenizer(_format_dialogue(last_dialogue.instruction), add_special_tokens=False)
- ['input_ids']) + max_new_tokens + self.context_len >= self.max_len:
+ assert last_dialogue.response == ""
+ if (
+ len(self.tokenizer(_format_dialogue(last_dialogue.instruction), add_special_tokens=False)["input_ids"])
+ + max_new_tokens
+ + self.context_len
+ >= self.max_len
+ ):
# to avoid truncate placeholder, apply truncate to the original instruction
- instruction_truncated = self.tokenizer(last_dialogue.instruction,
- add_special_tokens=False,
- truncation=True,
- max_length=(self.max_len - max_new_tokens - self.context_len -
- self.dialogue_placeholder_len))['input_ids']
+ instruction_truncated = self.tokenizer(
+ last_dialogue.instruction,
+ add_special_tokens=False,
+ truncation=True,
+ max_length=(self.max_len - max_new_tokens - self.context_len - self.dialogue_placeholder_len),
+ )["input_ids"]
instruction_truncated = self.tokenizer.decode(instruction_truncated).lstrip()
prompt += _format_dialogue(instruction_truncated)
return prompt
- res_len = self.max_len - max_new_tokens - len(self.tokenizer(prompt)['input_ids'])
+ res_len = self.max_len - max_new_tokens - len(self.tokenizer(prompt)["input_ids"])
rows = []
for dialogue in history[::-1]:
text = _format_dialogue(dialogue.instruction, dialogue.response)
- cur_len = len(self.tokenizer(text, add_special_tokens=False)['input_ids'])
+ cur_len = len(self.tokenizer(text, add_special_tokens=False)["input_ids"])
if res_len - cur_len < 0:
break
res_len -= cur_len
rows.insert(0, text)
- prompt += ''.join(rows) + _format_dialogue(last_dialogue.instruction)
+ prompt += "".join(rows) + _format_dialogue(last_dialogue.instruction)
return prompt
def postprocess_output(self, output: str) -> str:
- output = STOP_PAT.sub('', output)
+ output = STOP_PAT.sub("", output)
return output.strip()
def has_censored_words(self, text: str) -> bool:
@@ -184,7 +192,6 @@ def has_censored_words(self, text: str) -> bool:
class LockedIterator:
-
def __init__(self, it, lock: Lock) -> None:
self.lock = lock
self.it = iter(it)
diff --git a/applications/Chat/requirements-test.txt b/applications/Chat/requirements-test.txt
index eb1a77875acb..93d48bcb6f79 100644
--- a/applications/Chat/requirements-test.txt
+++ b/applications/Chat/requirements-test.txt
@@ -1,2 +1,2 @@
pytest
-colossalai==0.3.1
\ No newline at end of file
+colossalai==0.3.3
diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt
index e5f5ca0932a8..e56aaca0e7cb 100644
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
@@ -2,7 +2,7 @@ transformers>=4.20.1
tqdm
datasets
loralib
-colossalai==0.3.1
+colossalai==0.3.3
torch<2.0.0, >=1.12.1
langchain
tokenizers
@@ -11,3 +11,4 @@ sse_starlette
wandb
sentencepiece
gpustat
+tensorboard
diff --git a/applications/Chat/setup.py b/applications/Chat/setup.py
index a285a6dff4bf..eb44b6203ef8 100644
--- a/applications/Chat/setup.py
+++ b/applications/Chat/setup.py
@@ -2,40 +2,42 @@
def fetch_requirements(path):
- with open(path, 'r') as fd:
+ with open(path, "r") as fd:
return [r.strip() for r in fd.readlines()]
def fetch_readme():
- with open('README.md', encoding='utf-8') as f:
+ with open("README.md", encoding="utf-8") as f:
return f.read()
def fetch_version():
- with open('version.txt', 'r') as f:
+ with open("version.txt", "r") as f:
return f.read().strip()
setup(
- name='coati',
+ name="coati",
version=fetch_version(),
- packages=find_packages(exclude=(
- 'tests',
- 'benchmarks',
- '*.egg-info',
- )),
- description='Colossal-AI Talking Intelligence',
+ packages=find_packages(
+ exclude=(
+ "tests",
+ "benchmarks",
+ "*.egg-info",
+ )
+ ),
+ description="Colossal-AI Talking Intelligence",
long_description=fetch_readme(),
- long_description_content_type='text/markdown',
- license='Apache Software License 2.0',
- url='https://github.com/hpcaitech/Coati',
- install_requires=fetch_requirements('requirements.txt'),
- python_requires='>=3.6',
+ long_description_content_type="text/markdown",
+ license="Apache Software License 2.0",
+ url="https://github.com/hpcaitech/Coati",
+ install_requires=fetch_requirements("requirements.txt"),
+ python_requires=">=3.6",
classifiers=[
- 'Programming Language :: Python :: 3',
- 'License :: OSI Approved :: Apache Software License',
- 'Environment :: GPU :: NVIDIA CUDA',
- 'Topic :: Scientific/Engineering :: Artificial Intelligence',
- 'Topic :: System :: Distributed Computing',
+ "Programming Language :: Python :: 3",
+ "License :: OSI Approved :: Apache Software License",
+ "Environment :: GPU :: NVIDIA CUDA",
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ "Topic :: System :: Distributed Computing",
],
)
diff --git a/applications/Chat/tests/test_checkpoint.py b/applications/Chat/tests/test_checkpoint.py
index 3a3bf5b19cb8..9c08aa36c9b4 100644
--- a/applications/Chat/tests/test_checkpoint.py
+++ b/applications/Chat/tests/test_checkpoint.py
@@ -22,25 +22,21 @@ def get_data(batch_size: int, seq_len: int = 10) -> dict:
return dict(input_ids=input_ids, attention_mask=attention_mask)
-def train_step(strategy: Strategy,
- actor: GPTActor,
- actor_optim: HybridAdam,
- batch_size: int = 8):
+def train_step(strategy: Strategy, actor: GPTActor, actor_optim: HybridAdam, batch_size: int = 8):
data = get_data(batch_size)
action_mask = torch.ones_like(data["attention_mask"], dtype=torch.bool)
- actor_output = actor(data["input_ids"], data["attention_mask"])
- action_log_probs = calc_action_log_probs(actor_output, data["input_ids"], action_mask.size(1))
+ actor_logits = actor(data["input_ids"], data["attention_mask"])["logits"]
+ action_log_probs = calc_action_log_probs(actor_logits, data["input_ids"], action_mask.size(1))
loss = action_log_probs.sum()
strategy.backward(loss, actor, actor_optim)
strategy.optimizer_step(actor_optim)
-def run_test_checkpoint(strategy_name: str,
- shard: bool):
+def run_test_checkpoint(strategy_name: str, shard: bool):
if strategy_name == "ddp":
strategy = DDPStrategy()
elif strategy_name == "colossalai_gemini":
- strategy = GeminiStrategy(placement_policy="cuda", initial_scale=2**5)
+ strategy = GeminiStrategy(placement_policy="auto", initial_scale=2**5)
elif strategy_name == "colossalai_zero2":
strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
else:
@@ -60,12 +56,10 @@ def run_test_checkpoint(strategy_name: str,
dist.broadcast_object_list(rank0_dirname)
rank0_dirname = rank0_dirname[0]
- model_path = os.path.join(
- rank0_dirname, "model" if shard else f"model.pt")
- strategy.save_model(actor, model_path, only_rank0=not shard)
- optim_path = os.path.join(
- rank0_dirname, "optim" if shard else "optim.pt")
- strategy.save_optimizer(actor_optim, optim_path, only_rank0=not shard)
+ model_path = os.path.join(rank0_dirname, "model" if shard else f"model.pt")
+ strategy.save_model(actor, model_path)
+ optim_path = os.path.join(rank0_dirname, "optim" if shard else "optim.pt")
+ strategy.save_optimizer(actor_optim, optim_path)
dist.barrier()
strategy.load_model(actor, model_path, strict=False)
@@ -75,11 +69,7 @@ def run_test_checkpoint(strategy_name: str,
train_step(strategy, actor, actor_optim)
-def run_dist(rank: int,
- world_size: int,
- port: int,
- strategy_name: str,
- shard: bool):
+def run_dist(rank: int, world_size: int, port: int, strategy_name: str, shard: bool):
os.environ["RANK"] = str(rank)
os.environ["LOCAL_RANK"] = str(rank)
os.environ["WORLD_SIZE"] = str(world_size)
@@ -93,13 +83,8 @@ def run_dist(rank: int,
@pytest.mark.parametrize("strategy_name", ["ddp", "colossalai_gemini", "colossalai_zero2"])
@pytest.mark.parametrize("shard", [False, True])
@rerun_if_address_is_in_use()
-def test_checkpoint(world_size: int,
- strategy_name: str,
- shard: bool):
- spawn(run_dist,
- world_size,
- strategy_name=strategy_name,
- shard=shard)
+def test_checkpoint(world_size: int, strategy_name: str, shard: bool):
+ spawn(run_dist, world_size, strategy_name=strategy_name, shard=shard)
if __name__ == "__main__":
diff --git a/applications/Chat/tests/test_dataset.py b/applications/Chat/tests/test_dataset.py
index f9dee1bae935..ec61bbb13fd7 100644
--- a/applications/Chat/tests/test_dataset.py
+++ b/applications/Chat/tests/test_dataset.py
@@ -8,62 +8,40 @@
from coati.dataset.prompt_dataset import PromptDataset
from coati.dataset.reward_dataset import HhRlhfDataset, RmStaticDataset
from coati.dataset.sft_dataset import IGNORE_INDEX, SFTDataset, SupervisedDataset
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
from datasets import load_dataset
from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer, PreTrainedTokenizer
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
-from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
+
SFT_DATASET = [
{
- "instruction":
- "Provide a list of the top 10 most popular mobile games in Asia",
- "input":
- "",
- "output":
- "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
- "id":
- 0
+ "instruction": "Provide a list of the top 10 most popular mobile games in Asia",
+ "input": "",
+ "output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
+ "id": 0,
},
{
- "instruction":
- "Please provide an action plan for reducing carbon footprint on a corporate level",
- "input":
- "",
- "output":
- "An action plan for reducing carbon footprint on a corporate level could include switching to renewable energy sources (e.g., solar or wind powered), investing in energy efficient infrastructure, implementing green production practices (e.g., reduce water and paper usage, dispose of materials responsibly, recycle/reuse materials), reducing or eliminating travel, encouraging employees to be conscious of and reduce their own energy usage, and incentivizing employees to participate in sustainable work practices.",
- "id":
- 1
+ "instruction": "Please provide an action plan for reducing carbon footprint on a corporate level",
+ "input": "",
+ "output": "An action plan for reducing carbon footprint on a corporate level could include switching to renewable energy sources (e.g., solar or wind powered), investing in energy efficient infrastructure, implementing green production practices (e.g., reduce water and paper usage, dispose of materials responsibly, recycle/reuse materials), reducing or eliminating travel, encouraging employees to be conscious of and reduce their own energy usage, and incentivizing employees to participate in sustainable work practices.",
+ "id": 1,
},
{
- "instruction":
- "Write a persuasive email to your boss explaining why you should have a pay raise",
- "input":
- "",
- "output":
- "Dear [Boss], \n\nI hope this message finds you well. I am writing to request a pay raise.\n\nAs you know, I have been a dedicated and hardworking member of the team since I started working here [insert number] of months/years ago. My enthusiasm and passion for my job has remained consistent over the years, and I have always given 100% to my role. \n\nI understand that the current financial situation is challenging, however, I would sincerely appreciate you taking the time to consider my request. I believe that my dedication to the job and the value that I bring to the organization warrants a raise. I work diligently and am confident that I can continue to be an asset to the company. \n\nI hope my request is taken into account and I thank you in advance for your understanding. I look forward to our conversation. \n\nSincerely,\n[Your Name]",
- "id":
- 2
+ "instruction": "Write a persuasive email to your boss explaining why you should have a pay raise",
+ "input": "",
+ "output": "Dear [Boss], \n\nI hope this message finds you well. I am writing to request a pay raise.\n\nAs you know, I have been a dedicated and hardworking member of the team since I started working here [insert number] of months/years ago. My enthusiasm and passion for my job has remained consistent over the years, and I have always given 100% to my role. \n\nI understand that the current financial situation is challenging, however, I would sincerely appreciate you taking the time to consider my request. I believe that my dedication to the job and the value that I bring to the organization warrants a raise. I work diligently and am confident that I can continue to be an asset to the company. \n\nI hope my request is taken into account and I thank you in advance for your understanding. I look forward to our conversation. \n\nSincerely,\n[Your Name]",
+ "id": 2,
},
]
PROMPT_DATASET = [
{
- "instruction":
- "Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
- "id":
- 0
- },
- {
- "instruction": "Write a descriptive paragraph about a memorable vacation you went on",
- "id": 1
- },
- {
- "instruction": "Write a persuasive essay arguing why homework should be banned in schools",
- "id": 2
- },
- {
- "instruction": "Create a chart comparing the statistics on student debt in the United States.",
- "id": 3
+ "instruction": 'Edit this paragraph to make it more concise: "Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends."',
+ "id": 0,
},
+ {"instruction": "Write a descriptive paragraph about a memorable vacation you went on", "id": 1},
+ {"instruction": "Write a persuasive essay arguing why homework should be banned in schools", "id": 2},
+ {"instruction": "Create a chart comparing the statistics on student debt in the United States.", "id": 3},
]
@@ -120,10 +98,12 @@ def test_prompt_dataset(model: str, max_datasets_size: int, max_length: int):
json.dump(PROMPT_DATASET, f)
tokenizer = make_tokenizer(model)
assert tokenizer.padding_side in ("left", "right")
- prompt_dataset = PromptDataset(data_path=os.path.join(tmp_dir, dataset_name),
- tokenizer=tokenizer,
- max_datasets_size=max_datasets_size,
- max_length=max_length)
+ prompt_dataset = PromptDataset(
+ data_path=os.path.join(tmp_dir, dataset_name),
+ tokenizer=tokenizer,
+ max_datasets_size=max_datasets_size,
+ max_length=max_length,
+ )
assert len(prompt_dataset) == min(max_datasets_size, len(PROMPT_DATASET))
for i in range(len(prompt_dataset)):
assert isinstance(prompt_dataset[i], dict)
@@ -137,14 +117,14 @@ def test_prompt_dataset(model: str, max_datasets_size: int, max_length: int):
@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
-@pytest.mark.parametrize(["dataset_path", "subset"], [("Anthropic/hh-rlhf", "harmless-base"),
- ("Dahoas/rm-static", None)])
+@pytest.mark.parametrize(
+ ["dataset_path", "subset"], [("Anthropic/hh-rlhf", "harmless-base"), ("Dahoas/rm-static", None)]
+)
@pytest.mark.parametrize("max_datasets_size", [32])
@pytest.mark.parametrize("max_length", [32, 1024])
def test_reward_dataset(model: str, dataset_path: str, subset: Optional[str], max_datasets_size: int, max_length: int):
data = load_dataset(dataset_path, data_dir=subset)
- assert max_datasets_size <= len(data["train"]) \
- and max_datasets_size <= len(data["test"])
+ assert max_datasets_size <= len(data["train"]) and max_datasets_size <= len(data["test"])
train_data = data["train"].select(range(max_datasets_size))
test_data = data["test"].select(range(max_datasets_size))
tokenizer = make_tokenizer(model)
@@ -162,8 +142,7 @@ def test_reward_dataset(model: str, dataset_path: str, subset: Optional[str], ma
assert len(train_dataset) == len(test_dataset) == max_datasets_size
for i in range(max_datasets_size):
chosen_ids, c_mask, reject_ids, r_mask = train_dataset[i]
- assert chosen_ids.shape == c_mask.shape == \
- reject_ids.shape == r_mask.shape == torch.Size([max_length])
+ assert chosen_ids.shape == c_mask.shape == reject_ids.shape == r_mask.shape == torch.Size([max_length])
c_mask = c_mask.to(torch.bool)
r_mask = r_mask.to(torch.bool)
if chosen_ids.masked_select(c_mask)[-1] == tokenizer.eos_token_id:
@@ -180,8 +159,7 @@ def test_reward_dataset(model: str, dataset_path: str, subset: Optional[str], ma
assert torch.all(r_mask)
chosen_ids, c_mask, reject_ids, r_mask = test_dataset[i]
- assert chosen_ids.shape == c_mask.shape == \
- reject_ids.shape == r_mask.shape == torch.Size([max_length])
+ assert chosen_ids.shape == c_mask.shape == reject_ids.shape == r_mask.shape == torch.Size([max_length])
c_mask = c_mask.to(torch.bool)
r_mask = r_mask.to(torch.bool)
if chosen_ids.masked_select(c_mask)[-1] == tokenizer.eos_token_id:
@@ -198,7 +176,6 @@ def test_reward_dataset(model: str, dataset_path: str, subset: Optional[str], ma
assert torch.all(r_mask)
-
@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama", "chatglm"])
@pytest.mark.parametrize("dataset_path", ["yizhongw/self_instruct", None])
@pytest.mark.parametrize("max_dataset_size", [2])
@@ -214,10 +191,12 @@ def test_sft_dataset(model: str, dataset_path: Optional[str], max_dataset_size:
dataset_name = "sft_dataset.json"
with open(os.path.join(tmp_dir, dataset_name), "w") as f:
json.dump(SFT_DATASET, f)
- sft_dataset = SupervisedDataset(tokenizer=tokenizer,
- data_path=os.path.join(tmp_dir, dataset_name),
- max_datasets_size=max_dataset_size,
- max_length=max_length)
+ sft_dataset = SupervisedDataset(
+ tokenizer=tokenizer,
+ data_path=os.path.join(tmp_dir, dataset_name),
+ max_datasets_size=max_dataset_size,
+ max_length=max_length,
+ )
assert len(sft_dataset) == min(max_dataset_size, len(SFT_DATASET))
if isinstance(tokenizer, ChatGLMTokenizer):
@@ -227,20 +206,19 @@ def test_sft_dataset(model: str, dataset_path: Optional[str], max_dataset_size:
input_ids = sft_dataset[i]["input_ids"]
labels = sft_dataset[i]["labels"]
assert input_ids.shape == labels.shape == torch.Size([max_length])
-
+
ignore_mask = labels == IGNORE_INDEX
assert input_ids.masked_select(torch.logical_not(ignore_mask))[0] == tokenizer.bos_token_id
check_content(input_ids.masked_select(torch.logical_not(ignore_mask)), tokenizer, model)
return
-
+
for i in range(max_dataset_size):
assert isinstance(sft_dataset[i], dict)
assert list(sft_dataset[i].keys()) == ["input_ids", "labels", "attention_mask"]
input_ids = sft_dataset[i]["input_ids"]
labels = sft_dataset[i]["labels"]
attention_mask = sft_dataset[i]["attention_mask"].to(torch.bool)
- assert input_ids.shape == labels.shape == \
- attention_mask.shape == torch.Size([max_length])
+ assert input_ids.shape == labels.shape == attention_mask.shape == torch.Size([max_length])
if input_ids.masked_select(attention_mask)[-1] == tokenizer.eos_token_id:
check_content(input_ids.masked_select(attention_mask)[:-1], tokenizer, model)
assert torch.all(input_ids.masked_select(torch.logical_not(attention_mask)) == tokenizer.pad_token_id)
@@ -248,19 +226,16 @@ def test_sft_dataset(model: str, dataset_path: Optional[str], max_dataset_size:
check_content(input_ids.masked_select(attention_mask), tokenizer, model)
assert torch.all(attention_mask)
ignore_mask = labels == IGNORE_INDEX
- check_content(input_ids.masked_select(ignore_mask), tokenizer, model)
+ prompt_mask = torch.logical_and(ignore_mask, attention_mask)
+ check_content(input_ids.masked_select(prompt_mask), tokenizer, model)
+ assert torch.all(input_ids.masked_select(ignore_mask ^ prompt_mask) == tokenizer.pad_token_id)
if __name__ == "__main__":
test_sft_dataset(model="bloom", dataset_path="yizhongw/self_instruct", max_dataset_size=2, max_length=256)
- test_reward_dataset(model="gpt2",
- dataset_path="Anthropic/hh-rlhf",
- subset="harmless-base",
- max_datasets_size=8,
- max_length=256)
-
- test_prompt_dataset(model="opt",
- max_datasets_size=2,
- max_length=128)
+ test_reward_dataset(
+ model="gpt2", dataset_path="Anthropic/hh-rlhf", subset="harmless-base", max_datasets_size=8, max_length=256
+ )
+ test_prompt_dataset(model="opt", max_datasets_size=2, max_length=128)
diff --git a/applications/Chat/tests/test_experience.py b/applications/Chat/tests/test_experience.py
index 071e50b90e8e..a9591259800d 100644
--- a/applications/Chat/tests/test_experience.py
+++ b/applications/Chat/tests/test_experience.py
@@ -1,5 +1,5 @@
+import copy
import os
-from copy import deepcopy
import pytest
import torch
@@ -8,6 +8,7 @@
from coati.experience_maker import NaiveExperienceMaker
from coati.models.base import RewardModel
from coati.models.gpt import GPTActor, GPTCritic
+from coati.trainer.ppo import _set_default_generate_kwargs
from coati.trainer.strategies import DDPStrategy, GeminiStrategy
from coati.trainer.strategies.colossalai import LowLevelZeroStrategy
from transformers.models.gpt2.configuration_gpt2 import GPT2Config
@@ -18,7 +19,7 @@
def get_data(batch_size: int, seq_len: int = 10) -> dict:
- input_ids = torch.randint(0, 50257, (batch_size, seq_len), device='cuda')
+ input_ids = torch.randint(0, 50257, (batch_size, seq_len), device="cuda")
attention_mask = torch.ones_like(input_ids)
return dict(input_ids=input_ids, attention_mask=attention_mask)
@@ -37,34 +38,43 @@ def make_and_consume_experience(strategy):
EXPERIENCE_BATCH_SIZE = 4
SAMPLE_BATCH_SIZE = 2
- if strategy == 'ddp':
+ if strategy == "ddp":
strategy = DDPStrategy()
- elif strategy == 'colossalai-zero2':
+ elif strategy == "colossalai-zero2":
strategy = LowLevelZeroStrategy()
- elif strategy == 'colossalai-gemini':
- strategy = GeminiStrategy(placement_policy='cuda')
+ elif strategy == "colossalai-gemini":
+ strategy = GeminiStrategy(placement_policy="static")
else:
raise ValueError(f'Unsupported strategy "{strategy}"')
- actor = GPTActor(config=GPT_CONFIG).cuda()
- critic = GPTCritic(config=GPT_CONFIG).cuda()
+ with strategy.model_init_context():
+ actor = GPTActor(config=GPT_CONFIG).cuda()
+ critic = GPTCritic(config=GPT_CONFIG).cuda()
- initial_model = deepcopy(actor)
- reward_model = RewardModel(deepcopy(critic.model)).cuda()
+ initial_model = GPTActor(config=GPT_CONFIG).cuda()
+ reward_model = RewardModel(model=copy.deepcopy(critic.model)).cuda()
- experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model)
+ actor, critic, initial_model, reward_model = strategy.prepare(actor, critic, initial_model, reward_model)
+
+ class MockTokenizer:
+ def __init__(self):
+ self.padding_side = "left"
+ self.eos_token_id = 0
+ self.pad_token_id = 0
+
+ tokenizer = MockTokenizer()
+ experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, tokenizer)
data_buffer = NaiveExperienceBuffer(SAMPLE_BATCH_SIZE, cpu_offload=False)
+ generate_kwargs = dict(do_sample=True, max_length=16)
+ generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor)
+
# experience of all ranks should be the same
for _ in range(2):
data = get_data(EXPERIENCE_BATCH_SIZE)
- assert gather_and_equal(data['input_ids'])
- assert gather_and_equal(data['attention_mask'])
- experience = experience_maker.make_experience(**data,
- do_sample=True,
- max_length=16,
- eos_token_id=50256,
- pad_token_id=50256)
+ assert gather_and_equal(data["input_ids"])
+ assert gather_and_equal(data["attention_mask"])
+ experience = experience_maker.make_experience(**data, do_sample=True, max_length=16)
assert gather_and_equal(experience.sequences)
assert gather_and_equal(experience.action_log_probs)
assert gather_and_equal(experience.values)
@@ -75,7 +85,7 @@ def make_and_consume_experience(strategy):
data_buffer.append(experience)
# data buffer's data should be the same
- buffer_size = torch.tensor([len(data_buffer)], device='cuda')
+ buffer_size = torch.tensor([len(data_buffer)], device="cuda")
assert gather_and_equal(buffer_size)
for item in data_buffer.items:
assert gather_and_equal(item.sequences)
@@ -88,7 +98,7 @@ def make_and_consume_experience(strategy):
# dataloader of each rank should have the same size and different batch
dataloader = strategy.setup_dataloader(data_buffer)
- dataloader_size = torch.tensor([len(dataloader)], device='cuda')
+ dataloader_size = torch.tensor([len(dataloader)], device="cuda")
assert gather_and_equal(dataloader_size)
for experience in dataloader:
assert not gather_and_equal(experience.sequences)
@@ -100,21 +110,21 @@ def make_and_consume_experience(strategy):
def run_dist(rank, world_size, port, strategy):
- os.environ['RANK'] = str(rank)
- os.environ['LOCAL_RANK'] = str(rank)
- os.environ['WORLD_SIZE'] = str(world_size)
- os.environ['MASTER_ADDR'] = 'localhost'
- os.environ['MASTER_PORT'] = str(port)
+ os.environ["RANK"] = str(rank)
+ os.environ["LOCAL_RANK"] = str(rank)
+ os.environ["WORLD_SIZE"] = str(world_size)
+ os.environ["MASTER_ADDR"] = "localhost"
+ os.environ["MASTER_PORT"] = str(port)
make_and_consume_experience(strategy)
@pytest.mark.dist
-@pytest.mark.parametrize('world_size', [2])
-@pytest.mark.parametrize('strategy', ['ddp', 'colossalai-zero2', 'colossalai-gemini'])
+@pytest.mark.parametrize("world_size", [2])
+@pytest.mark.parametrize("strategy", ["ddp", "colossalai-zero2", "colossalai-gemini"])
@rerun_if_address_is_in_use()
def test_experience(world_size, strategy):
spawn(run_dist, world_size, strategy=strategy)
-if __name__ == '__main__':
- test_experience(2, 'colossalai')
+if __name__ == "__main__":
+ test_experience(2, "colossalai-zero2")
diff --git a/applications/Chat/tests/test_models.py b/applications/Chat/tests/test_models.py
index b98b3615cd28..b2c22ac6a3b9 100644
--- a/applications/Chat/tests/test_models.py
+++ b/applications/Chat/tests/test_models.py
@@ -6,15 +6,16 @@
import torch.nn as nn
from coati.models.base import Actor, Critic, RewardModel, get_base_model
from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
+from coati.models.chatglm import ChatGLMActor
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
from coati.models.generation import generate
from coati.models.gpt import GPTRM, GPTActor, GPTCritic
-from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
-from coati.models.chatglm import ChatGLMActor
+from coati.models.llama import LlamaActor
from coati.models.lora import LoraLinear, convert_to_lora_module
from coati.models.loss import GPTLMLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
from coati.models.opt import OPTRM, OPTActor, OPTCritic
-from coati.models.utils import calc_action_log_probs, compute_reward, masked_mean
-from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
+from coati.models.utils import calc_action_log_probs, masked_mean
+
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seq_len", [32])
@@ -23,23 +24,34 @@
[
lambda: BLOOMActor(),
lambda: GPTActor(),
- # HACK: skip llama due to long execution time
- # lambda: LlamaActor(),
- lambda: OPTActor(),
- # lambda: ChatGLMActor(),
-])
-
-@pytest.mark.parametrize("generate_kwargs", [{
- "max_length": 64,
- "use_cache": True,
- "do_sample": True,
- "temperature": 1.0,
- "top_k": 50,
-}])
+ # HACK: skip llama due to long execution time
+ # lambda: LlamaActor(),
+ lambda: OPTActor(),
+ ],
+)
+@pytest.mark.parametrize(
+ "generate_kwargs",
+ [
+ {
+ "max_length": 64,
+ "use_cache": True,
+ "do_sample": True,
+ "temperature": 1.0,
+ "top_k": 50,
+ }
+ ],
+)
def test_generation(actor_maker: Callable[[], Actor], batch_size: int, seq_len: int, generate_kwargs: Dict[str, Any]):
+ class MockTokenizer:
+ def __init__(self):
+ self.padding_side = "left"
+ self.eos_token_id = 0
+ self.pad_token_id = 0
+
actor = actor_maker()
input_ids = torch.randint(0, 100, (batch_size, seq_len)).cuda()
- sequences = generate(actor.cuda(), input_ids, **generate_kwargs)
+ tokenizer = MockTokenizer()
+ sequences = generate(actor.cuda(), input_ids, tokenizer, **generate_kwargs)
assert sequences.shape == (batch_size, generate_kwargs["max_length"])
@@ -49,26 +61,12 @@ def test_utils():
assert fn_output.dim() == 0
assert torch.allclose(fn_output, torch.tensor(1.0))
- batch_size = 4
- num_labels = 10
- fn_input = {
- "r": torch.ones((batch_size,)),
- "kl_coef": 1.0,
- "log_probs": torch.randn((batch_size, num_labels)),
- "log_probs_base": torch.randn((batch_size, num_labels)),
- "action_mask": torch.randint(0, 2, (batch_size, num_labels))
- }
- fn_output = compute_reward(**fn_input)
- assert fn_output.shape == (batch_size,)
-
batch_size = 4
seq_len = 32
num_labels = 10
num_actions = 2
fn_input = {
- "output": {
- "logits": torch.randn((batch_size, seq_len, num_labels))
- },
+ "logits": torch.randn((batch_size, seq_len, num_labels)),
"sequences": torch.randint(0, num_labels, (batch_size, seq_len)),
"num_actions": num_actions,
}
@@ -105,8 +103,9 @@ def test_lora(lora_rank: int, num_dim: int, num_layers: int):
assert isinstance(lora_model[i], LoraLinear)
assert torch.allclose(old_model[i].weight, lora_model[i].weight)
assert torch.allclose(old_model[i].bias, lora_model[i].bias)
- assert not torch.allclose(old_model[i].lora_B @ old_model[i].lora_A,
- lora_model[i].lora_B @ lora_model[i].lora_A)
+ assert not torch.allclose(
+ old_model[i].lora_B @ old_model[i].lora_A, lora_model[i].lora_B @ lora_model[i].lora_A
+ )
@pytest.mark.parametrize("batch_size", [8])
@@ -116,54 +115,59 @@ def test_lora(lora_rank: int, num_dim: int, num_layers: int):
[
lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()),
lambda: (GPTActor(), GPTCritic(), GPTRM()),
- # HACK: skip llama due to long execution time
- # lambda: (LlamaActor(), LlamaCritic(), LlamaRM()),
- lambda: (OPTActor(), OPTCritic(), OPTRM()),
- lambda: (ChatGLMActor(), None, None),
-])
+ # HACK: skip llama due to long execution time
+ # lambda: (LlamaActor(), LlamaCritic(), LlamaRM()),
+ lambda: (OPTActor(), OPTCritic(), OPTRM()),
+ lambda: (ChatGLMActor(), None, None),
+ ],
+)
@torch.no_grad()
-def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]],
- batch_size: int,
- seq_len: int):
+def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]], batch_size: int, seq_len: int):
actor_input = {
"input_ids": torch.randint(0, 100, (batch_size, seq_len)),
- "attention_mask": torch.randint(0, 2, (batch_size, seq_len))
+ "attention_mask": torch.randint(0, 2, (batch_size, seq_len)),
}
critic_input = {
"sequences": torch.randint(0, 100, (batch_size, seq_len)),
- "action_mask": torch.randint(0, 2, (batch_size, seq_len)),
- "attention_mask": torch.randint(0, 2, (batch_size, seq_len))
+ "attention_mask": torch.randint(0, 2, (batch_size, seq_len)),
}
rm_input = {
"sequences": torch.randint(0, 100, (batch_size, seq_len)),
- "attention_mask": torch.randint(0, 2, (batch_size, seq_len))
+ "attention_mask": torch.randint(0, 2, (batch_size, seq_len)),
}
actor, critic, rm = models_maker()
if isinstance(actor, ChatGLMActor):
actor = actor.float()
- tokenizer = ChatGLMTokenizer.from_pretrained( "THUDM/chatglm-6b", trust_remote_code=True)
+ tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
chatglm_special_token = torch.tensor([tokenizer.gmask_token_id, tokenizer.bos_token_id]).repeat(batch_size, 1)
- actor_input ={
- "input_ids": torch.cat((torch.randint(0, 100, (batch_size, seq_len//2)), chatglm_special_token, torch.randint(0, 100, (batch_size, seq_len//2 - 2))), dim=1),
- "attention_mask": torch.randint(0, 2, (batch_size, 1, seq_len, seq_len))
- }
+ actor_input = {
+ "input_ids": torch.cat(
+ (
+ torch.randint(0, 100, (batch_size, seq_len // 2)),
+ chatglm_special_token,
+ torch.randint(0, 100, (batch_size, seq_len // 2 - 2)),
+ ),
+ dim=1,
+ ),
+ "attention_mask": torch.randint(0, 2, (batch_size, 1, seq_len, seq_len)),
+ }
assert isinstance(actor, Actor)
- base_actor_model = get_base_model(actor)
+ get_base_model(actor)
actor_output = actor(**actor_input)
assert actor_output.logits.shape[:2] == (batch_size, seq_len)
if critic:
assert isinstance(critic, Critic)
- base_critic_model = get_base_model(critic)
+ get_base_model(critic)
critic_output = critic(**critic_input)
- assert critic_output.shape == (batch_size, )
-
+ assert critic_output.shape == (batch_size,)
+
if rm:
assert isinstance(rm, RewardModel)
- base_rm_model = get_base_model(rm)
+ get_base_model(rm)
rm_output = rm(**rm_input)
- assert rm_output.shape == (batch_size, )
+ assert rm_output.shape == (batch_size,)
@pytest.mark.parametrize("batch_size", [16])
@@ -173,39 +177,59 @@ def test_loss(batch_size: int, seq_len: int, num_labels: int):
loss = GPTLMLoss()
loss_input = {
"logits": torch.randn(batch_size, seq_len, num_labels),
- "labels": torch.randint(0, num_labels, (batch_size, seq_len))
+ "labels": torch.randint(0, num_labels, (batch_size, seq_len)),
}
- loss_output = loss(**loss_input)
+ loss(**loss_input)
loss = PolicyLoss()
loss_input = {
- "log_probs": torch.randn(batch_size,),
- "old_log_probs": torch.randn(batch_size,),
- "advantages": torch.randn(batch_size,)
+ "log_probs": torch.randn(
+ batch_size,
+ ),
+ "old_log_probs": torch.randn(
+ batch_size,
+ ),
+ "advantages": torch.randn(
+ batch_size,
+ ),
}
- loss_output = loss(**loss_input)
+ loss(**loss_input)
loss = ValueLoss()
loss_input = {
- "values": torch.randn(batch_size,),
- "old_values": torch.randn(batch_size,),
- "reward": torch.randn(batch_size,)
+ "values": torch.randn(
+ batch_size,
+ ),
+ "old_values": torch.randn(
+ batch_size,
+ ),
+ "reward": torch.randn(
+ batch_size,
+ ),
}
- loss_output = loss(**loss_input)
+ loss(**loss_input)
loss = LogSigLoss()
loss_input = {
- "chosen_reward": torch.randn(batch_size,),
- "reject_reward": torch.randn(batch_size,),
+ "chosen_reward": torch.randn(
+ batch_size,
+ ),
+ "reject_reward": torch.randn(
+ batch_size,
+ ),
}
- loss_output = loss(**loss_input)
+ loss(**loss_input)
loss = LogExpLoss()
loss_input = {
- "chosen_reward": torch.randn(batch_size,),
- "reject_reward": torch.randn(batch_size,),
+ "chosen_reward": torch.randn(
+ batch_size,
+ ),
+ "reject_reward": torch.randn(
+ batch_size,
+ ),
}
- loss_output = loss(**loss_input)
+ loss(**loss_input)
if __name__ == "__main__":
@@ -218,4 +242,4 @@ def test_loss(batch_size: int, seq_len: int, num_labels: int):
test_models(models_maker=lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()), batch_size=8, seq_len=128)
- test_loss(batch_size=8, seq_len=128, num_labels=100)
\ No newline at end of file
+ test_loss(batch_size=8, seq_len=128, num_labels=100)
diff --git a/applications/Chat/tests/test_train.sh b/applications/Chat/tests/test_train.sh
index c5127c188612..68fca7fbf8c0 100755
--- a/applications/Chat/tests/test_train.sh
+++ b/applications/Chat/tests/test_train.sh
@@ -24,8 +24,8 @@ if [ -z "$SFT_DATASET" ]; then
exit 1
fi
-if [ -z "$PROMPT_PATH" ]; then
- echo "Please set \$PROMPT_PATH to the path to prompts csv."
+if [ -z "$PROMPT_DATASET" ]; then
+ echo "Please set \$PROMPT_DATASET to the path to prompts csv."
exit 1
fi
@@ -41,6 +41,7 @@ MODELS_DIR=$BASE_DIR/examples/models_config
MODELS=('gpt2' 'bloom' 'opt' 'llama')
STRATEGIES=('ddp' 'colossalai_gemini' 'colossalai_zero2')
+
export OMP_NUM_THREADS=8
# install requirements
@@ -74,6 +75,7 @@ echo "[Test]: testing sft ..."
# FIXME: This is a hack to skip tests that are not working
# - gpt2-ddp: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
# - llama-*: These tests can be passed locally, skipped for long execution time
+# - *-gemini: Gemini plugin does not support `from_pretrained` yet
SKIPPED_TESTS=(
"gpt2-ddp"
"llama-ddp"
@@ -82,7 +84,7 @@ SKIPPED_TESTS=(
)
GRAD_CKPTS=('' '--grad_checkpoint')
-for lora_rank in '0' '4'; do
+for lora_rank in '0'; do
for model in ${MODELS[@]}; do
strategies=($(shuf -e "${STRATEGIES[@]}"))
for strategy in ${strategies[@]}; do
@@ -105,7 +107,7 @@ for lora_rank in '0' '4'; do
$pretrain_model --tokenizer $MODELS_DIR/$model \
--model $model --strategy $strategy --lora_rank $lora_rank $grad_ckpt \
--dataset $SFT_DATASET --max_datasets_size 8 \
- --max_epochs 1 --batch_size 1 --accumulation_steps 1 \
+ --max_epochs 1 --batch_size 1 --accumulation_steps 1 --lr 1e-8 \
--save_path $EXAMPLES_DIR/rlhf_models/sft_ckpt_${model}_${lora_rank}
passed=$?
if [ $passed -eq 0 ]; then
@@ -125,6 +127,7 @@ echo "[Test]: testing reward model ..."
# FIXME: This is a hack to skip tests that are not working
# - gpt2-ddp: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
# - llama-*: These tests can be passed locally, skipped for long execution time
+# - *-gemini: Gemini plugin does not support `from_pretrained` yet
SKIPPED_TESTS=(
"gpt2-ddp"
"llama-ddp"
@@ -134,7 +137,7 @@ SKIPPED_TESTS=(
LOSS_FNS=('log_sig' 'log_exp')
DATASETS=('Anthropic/hh-rlhf' 'Dahoas/rm-static')
-for lora_rank in '0' '4'; do
+for lora_rank in '0'; do
for model in ${MODELS[@]}; do
strategies=($(shuf -e "${STRATEGIES[@]}"))
for strategy in ${strategies[@]}; do
@@ -157,8 +160,9 @@ for lora_rank in '0' '4'; do
echo "[Test]: $model-$strategy-$lora_rank, attempt $i"
torchrun --standalone --nproc_per_node=4 $EXAMPLES_DIR/train_reward_model.py \
$pretrain_model --tokenizer $MODELS_DIR/$model \
- --model $model --strategy $strategy --lora_rank $lora_rank --loss_fn $loss_fn \
- --dataset $dataset --subset $subset --test True --batch_size 1 \
+ --dataset $dataset --subset $subset --max_datasets_size 8 \
+ --model $model --strategy $strategy --lora_rank $lora_rank \
+ --loss_fn $loss_fn --batch_size 1 --lr 1e-8 \
--save_path $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt
passed=$?
if [ $passed -eq 0 ]; then
@@ -178,6 +182,7 @@ echo "[Test]: testing RLHF ..."
# FIXME: This is a hack to skip tests that are not working
# - gpt2-ddp: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
# - llama-*: These tests can be passed locally, skipped for long execution time
+# - *-gemini: Gemini plugin does not support `from_pretrained` yet
SKIPPED_TESTS=(
"gpt2-ddp"
"llama-ddp"
@@ -186,7 +191,7 @@ SKIPPED_TESTS=(
)
for model in ${MODELS[@]}; do
- for lora_rank in '0' '4'; do
+ for lora_rank in '0'; do
strategies=($(shuf -e "${STRATEGIES[@]}"))
for strategy in ${strategies[@]}; do
if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$strategy-$lora_rank " ]]; then
@@ -204,13 +209,13 @@ for model in ${MODELS[@]}; do
for i in $(seq $NUM_RETRY); do
echo "[Test]: $model-$strategy-$lora_rank, attempt $i"
torchrun --standalone --nproc_per_node=4 $EXAMPLES_DIR/train_prompts.py \
- --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \
+ --prompt_dataset $PROMPT_DATASET --pretrain_dataset $PRETRAIN_DATASET --max_datasets_size 32 \
--strategy $strategy --model $model --tokenizer $MODELS_DIR/$model \
- --num_episodes 1 --num_collect_steps 1 --num_update_steps 1 \
+ --num_episodes 1 --num_collect_steps 1 --num_update_steps 1 --lr 1e-8 \
--experience_batch_size 2 --train_batch_size 1 --lora_rank $lora_rank \
--pretrain $EXAMPLES_DIR/rlhf_models/sft_ckpt_${model}_${lora_rank} \
$rm_pretrain_model --rm_path $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt \
- --save_path $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts.pt
+ --save_path $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts
passed=$?
if [ $passed -eq 0 ]; then
break
@@ -225,4 +230,4 @@ for model in ${MODELS[@]}; do
rm $EXAMPLES_DIR/rlhf_models/rm_ckpt_${model}_${lora_rank}.pt
done
done
-rm $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts.pt
+rm -rf $EXAMPLES_DIR/rlhf_models/actor_checkpoint_prompts
diff --git a/applications/Colossal-LLaMA-2/README.md b/applications/Colossal-LLaMA-2/README.md
new file mode 100644
index 000000000000..ae2e0c6bb2db
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/README.md
@@ -0,0 +1,408 @@
+
+
+
+
+
+
+## Table of Contents
+- [News](#news)
+- [Colossal-LLaMA-2-7B](#colossal-llama-2-7b)
+ - [Performance Evaluation](#performance-evaluation)
+ - [Examples](#examples)
+ - [Training Logs](#training-logs)
+ - [Import from Transformers](#import-from-transformers)
+- [Usage](#usage)
+ - [Install](#install)
+ - [How to run](#how-to-run)
+- [Technical Insight](#technical-insights)
+ - [Data](#data)
+ - [Tokenizer](#tokenizer)
+ - [Training Strategy](#training-strategy)
+ - [Bridging Any Domain-specific Large Models](#bridging-any-domain-specific-large-models)
+- [Citations](#citations)
+
+## News
+* [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific Llm Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
+[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2)
+[[blog]](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
+[[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-7b-base)
+[[Modelscope model weights]](https://www.modelscope.cn/models/colossalai/Colossal-LLaMA-2-7b-base/summary)
+
+
+## Colossal-LLaMA-2-7B
+The [Colossal-AI](https://github.com/hpcaitech/ColossalAI) team has introduced the open-source model **Colossal-LLaMA-2-7B-base**. This model, a derivation of LLaMA-2, has undergone continual pre-training involving approximately 8.5 billion tokens over a duration of 15 hours with 64 A800 GPUs. At a cost of **less than $1,000**, you can achieve results **similar to those that cost millions of dollars to pretrain from scratch**. It is licensed under the LLaMA-2 license and [Apache 2.0 License](https://github.com/hpcaitech/ColossalAI/blob/main/LICENSE) **without any additional commercial use restrictions**. This solution can also be used to build models of specific domain knowledge or tasks.
+
+Colossal-LLaMA-2-7B-base is designed to accommodate both the Chinese and English languages, featuring an expansive context window spanning 4096 tokens. Remarkably, it has exhibited exceptional performance when benchmarked against models of equivalent scale in standard Chinese and English evaluation metrics, including C-Eval and MMLU, among others.
+
+❗️**Important notice**:
+* All training data used for this project is collected from well-known public dataset.
+* We do not use any testing data from the evaluation benchmarks for training.
+
+### Performance Evaluation
+We conducted comprehensive evaluation on 4 dataset and compare our Colossal-Llama-2-7b-base model with various models.
+
+* We use 5-shot for MMLU and calculate scores based on the logits of first predicted token.
+* We use 5-shot for CMMLU and calculate scores based on the logits of first predicted token.
+* We use 5-shot for AGIEval and only calculate scores for 4-choice questions using a combination metric of exact match and the logits of first predicted token. If any of the exact match or logits of first predicted token is correct, the model will get the score.
+* We use 0-shot for GAOKAO-Bench and only calculate scores for 4-choice questions based on the logits of first predicted token.
+The generation config for all dataset is greedy search.
+* We also provided CEval scores from its lastest leaderboard or the official repository of the model.
+
+| | Backbone | Tokens Consumed | | MMLU | CMMLU | AGIEval | GAOKAO | CEval |
+| :----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :-----: | :----: | :----: | :------------------------------: |
+| | | - | | 5-shot | 5-shot | 5-shot | 0-shot | 5-shot |
+| Baichuan-7B | - | 1.2T | | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 |
+| Baichuan-13B-Base | - | 1.4T | | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 |
+| Baichuan2-7B-Base | - | 2.6T | | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 |
+| Baichuan2-13B-Base | - | 2.6T | | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 |
+| ChatGLM-6B | - | 1.0T | | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 |
+| ChatGLM2-6B | - | 1.4T | | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 |
+| InternLM-7B | - | 1.6T | | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 |
+| Qwen-7B (original) | - | 2.2T | | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 |
+| | | | | | | | | |
+| Llama-2-7B | - | 2.0T | | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - |
+| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | | 37.43 | 29.92 | 32.00 | 27.57 | - |
+| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | | 38.56 | 31.52 | 30.99 | 25.95 | - |
+| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 |
+| TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | | 43.73 | 42.04 | 37.64 | 30.61 | - |
+| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | | 48.41 | 38.31 | 38.45 | 27.72 | - |
+| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | | 49.96 | 41.10 | 39.83 | 33.00 | - |
+| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | | 50.25 | 40.99 | 40.04 | 30.54 | - |
+| | | | | | | | | |
+| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | | 53.06 | 49.89 | 51.48 | 58.82 | 50.2 |
+
+> The score in parentheses corresponds to the scores in the official repository of the model.
+>
+> We use zero-shot for ChatGLM models.
+>
+> Qwen-7B is now inaccessible in Hugging Face, we are using the latest version of it before it was made inaccessible. Only for dataset MMLU, the prompt would be "xxx Answer:"(remove the space after ":") and we calculate the logits over " A", " B", " C" and " D" for Qwen-7B. Qwen-7B tends to be much more deterministic than other models. For example, the logits over " A" can be `-inf` and softmax would be exact `0`.
+>
+> For other models and other dataset, we calculate logits over "A", "B", "C" and "D".
+
+❗️ More details of the evaluation methods and reproduction of the results, please refer to [ColossalEval](https://github.com/hpcaitech/ColossalAI/tree/main/applications/ColossalEval).
+
+### Examples
+| Question Type | Question |
Colossal-LLaMA-2-7b-base
|
+| :------: | :-----------------------: | :----------- |
+| Knowledge Quiz (Tourism) | 请问武汉的最佳的旅行季节是什么时候? | 3-4月份到武汉赏樱,10月份到武汉看黄陂枫树。武汉的最佳的旅行季节是春秋,3-4月份到武汉赏樱,10月份到武汉看黄陂枫树。武汉,简称“汉”别称江城,是湖北省省会,中部六省唯一的副省级市,特大城市,中国中部地区的中心城市,全国重要的工业基地、科教基地和综合交通枢纽。 |
+| Knowledge Quiz (Computer Science) | 在计算机中,什么是RAM?与ROM有什么区别? | RAM是随机存取内存,可以随时读写,但是断电之后会丢失数据。存储数据比较快,但是需要不断的进行通电供电。ROM是只读内存,只能读取数据,不能写入数据。断电之后,存储的数据也不会丢失,但是需要不断进行通电。两者的价格和性能上会有区别,而且在使用的时候,RAM需要不断的供电,而ROM断电数据不丢失。 |
+| Knowledge Quiz (Literature) | 古代文学与现代文学有哪些区别和共通之处? | 古代文学与现代文学有三点区别,现代文学更加通俗易懂,具有可读性,而古代文学相对来说更加注重形式,缺乏可读性。现代文学和古代文学,同属于中华文学大花园中的一枝花蕾。但是它们由于历史的原因,有着不同的特点,具体表现在,现代文学有更丰富的描写方法,更强烈的主观色彩和更突出的抒情色彩,而古代文学的主要特征是在创作风格上的多样化。 |
+| Emotional Analysis | 请判断这个句子的情绪是否正面:我迫不及待想见到她!!!”。 | 这一句是情绪是正向的。是在期待见面的喜悦情绪。 |
+| Translation | 将以下句子翻译成英语。 我喜欢看电影和读书。 | I like watching movies and reading books. |
+| Logical Reasoning | All books have the same weight, 10 books weigh 5kg, what is the weight of 2 books? | To find the weight of 2 books, we can use the following steps:
1. Determine the weight of 1 book. 2. Multiply the weight of 1 book by the number of books.
Step 1: Determine the weight of 1 book. Since 10 books weigh 5kg, we can divide 5kg by 10 to find the weight of 1 book. 5kg ÷ 10 = 0.5kg
Step 2: Multiply the weight of 1 book by the number of books. To find the weight of 2 books, we multiply the weight of 1 book by 2. 0.5kg × 2 = 1kg
So, the weight of 2 books is 1kg. |
+| Information Extraction | The company's annual conference, featuring keynote speakers and exclusive product launches, will be held at the Los Angeles Convention Center from October 20th to October 23rd, 2021. Extract the date mentioned in the above sentence. | The date mentioned in the sentence is October 20th to October 23rd, 2021. |
+| Error Correction | Please correct the syntax error and restructure the sentence: "Went to the store me and my friend." | The given sentence "Went to the store me and my friend." is incorrect due to a syntax error and it lacks a subject. The corrected version with proper grammar and sentence structure would be: "My friend and I went to the store." |
+
+❗️ More examples of question answering, please refer to [Colossal-LLaMA-2-7B-base Examples](docs/example.md).
+
+### Training Logs
+We also recorded the training logs for the experiment
+
+
+
+
+
+
+
+
+
+### Import from Transformers (Inference)
+To load Colossal-LLaMA-2-7B-base model using Transformers, use the following code:
+```Python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model = AutoModelForCausalLM.from_pretrained("hpcai-tech/Colossal-LLaMA-2-7b-base", device_map="auto", trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained("hpcai-tech/Colossal-LLaMA-2-7b-base", trust_remote_code=True)
+input = "离离原上草,"
+inputs = tokenizer(input, return_tensors='pt')
+inputs = inputs.to('cuda:0')
+pred = model.generate(**inputs,
+ max_new_tokens=256,
+ do_sample=True,
+ top_k=50,
+ top_p=0.95,
+ num_return_sequences=1)
+print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)[len(input):])
+```
+
+You can also load our model using modelscope, use the following code:
+```Python
+from modelscope import AutoModelForCausalLM, AutoTokenizer, snapshot_download
+model_dir = snapshot_download('colossalai/Colossal-LLaMA-2-7b-base', revision='v1.0.1')
+tokenizer = AutoTokenizer.from_pretrained(model_dir, device_map="auto", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True).eval()
+generation_kwargs = {"max_new_tokens": 256,
+ "top_p": 0.95,
+ "temperature": 0.3
+ }
+input = '离离原上草,'
+inputs = tokenizer(input, return_token_type_ids=False, return_tensors='pt')
+inputs = inputs.to('cuda:0')
+output = model.generate(**inputs, **generation_kwargs)
+print(tokenizer.decode(output.cpu()[0], skip_special_tokens=True)[len(input):])
+```
+You can download model weights from [🤗HuggingFace](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-7b-base) or [👾Modelscope](https://modelscope.cn/models/colossalai/Colossal-LLaMA-2-7b-base/summary).
+
+## Usage
+### Install
+
+#### 0. Pre-requisite
+1. This experiment was performed on 8 computing nodes with 64 A800 GPUs in total for LLaMA-2-7B (**about 1000 USD cost**). The nodes are connected with RDMA and GPUs within one node are fully connected with NVLink. The script was tested with CUDA 11.7, CUDA version requires 11.7 or higher. You can also complete it in about 5 days on a 8*A100/A800 server.
+
+2. PyTorch. The PyTorch version should be less than 2.0.0 and greater than 1.12.1.
+
+
+#### 1. Install required packages
+```
+cd Colossal-LLaMA-2
+pip install -r requirements.txt
+```
+#### 2. Install `xentropy`, `layer_norm` and `rotary`
+```bash
+git clone git@github.com:Dao-AILab/flash-attention.git
+# At the root folder
+cd csrc/xentropy && pip install .
+# At the root folder
+cd csrc/layer_norm && pip install .
+# At the root folder
+cd csrc/rotary && pip install .
+```
+
+### How to run
+
+#### 1. Init Tokenizer Preparation
+Initialize new tokenizer with additional Chinese tokens. Additional Chinese tokens are stored in `jsonl` format as follows:
+```json
+{"piece": "你好"}
+{"piece": "人工智能"}
+```
+Command to initialize new tokenizer:
+```bash
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python'
+python colossal_llama2/tokenizer/init_tokenizer.py \
+ --source_tokenizer_dir "" \
+ --target_tokenizer_dir "" \
+ --expand_tokens_file ".jsonl"
+```
+Here is details about CLI arguments:
+* Source tokenizer directory: `--source_tokenizer_dir`. Directory to the source tokenizer. It should at least contain three files: `special_tokens_map.json`, `tokenizer.model` and `tokenizer_config.json`.
+* Target tokenizer directory: `--target_tokenizer_dir`. Directory to the target tokenizer.
+* Tokens to be added: `--expand_tokens_file`. Additional tokens to be added to the tokenizer.
+
+#### 2. Init Model Preparation
+Initialize the new model checkpoint by calculating the mean values from the original model checkpoint.
+Command to initialize new model checkpoint:
+```bash
+python colossal_llama2/model/init_model.py \
+ --source_model_and_tokenizer_path "" \
+ --target_tokenizer_path "" \
+ --target_model_path ""
+```
+"" can be the same as "".
+
+Here is details about CLI arguments:
+* Source model and tokenizer path: `--source_model_and_tokenizer_path`. Source folder contains both model and tokenizer, for example, LLaMA-2 model in Hugging Face format.
+* Target tokenizer path: `--target_tokenizer_path`. Path to the new tokenizer folder generated from previous step.
+* Target model path: `--target_model_path`. Path to save the new model in Hugging Face format.
+
+❗️**Important**: Once you initialize the new model checkpoint, copy your new tokenizer files (`special_tokens_map.json`, `tokenizer.model` and `tokenizer_config.json`) to your new model folder.
+
+#### 3. Data Preparation
+Raw data should be formatted as `jsonl` format. Each data point should have the following fields:
+* `source` (str, compulsory): This part is ignored when calculating loss. Default can be empty.
+* `target` (str, compulsory): Loss will be calculated.
+* `category` (str, compulsory): Tags for each data point.
+
+Examples:
+```JSON
+{"source": "", "target": "Lionel Andrés Messi(Spanish pronunciation: [ljoˈnel anˈdɾes ˈmesi] (i); born 24 June 1987), also known as Leo Messi, is an Argentine professional footballer who plays as a forward for and captains both Major League Soccer club Inter Miami and the Argentina national team.", "category": "sports"}
+{"source": "猜谜语:一身卷卷细毛,吃的青青野草,过了数九寒冬,无私献出白毛。(打一动物)", "target": "白羊", "category": "riddle"}
+```
+You are allowed to customize the category tags or use `unknown` to define the category.
+
+Command to convert jsonl dataset to arrow format:
+```
+python prepare_pretrain_dataset.py \
+ --data_input_dirs ",," \
+ --tokenizer_dir "" \
+ --data_cache_dir "jsonl_to_arrow_cache" \
+ --data_jsonl_output_dir "spliced_tokenized_output_jsonl" \
+ --data_arrow_output_dir "spliced_tokenized_output_arrow" \
+ --max_length 4096 \
+ --num_spliced_dataset_bins 10
+```
+Here is details about CLI arguments:
+* Source data directory: `data_input_dirs`. Each `` can have multiple file in `jsonl` format.
+* Tokenzier directory: `tokenizer_dir`. Path to the tokenizer in Hugging Face format.
+* Data cache directory: `data_cache_dir`. Directory to store Hugging Face data cache. Default case will create `cache` folder locally.
+* Output directory for jsonl format: `data_jsonl_output_dir`. Output directory to store converted dataset in jsonl format.
+* Output directory for arrow format: `data_arrow_output_dir`. Output directory to store converted dataset in arrow format, which can be used for training directly.
+* Max length: `max_length`. Max length of spliced samples. Default value is 4096.
+* Number of bins for each category: `num_spliced_dataset_bins`. Number of bins for each category, used for bucket-based training.
+
+#### 4. Command Line Arguments for Training
+You can use `colossalai run` to launch multi-nodes training:
+```bash
+colossalai run --nproc_per_node YOUR_GPU_PER_NODE --hostfile YOUR_HOST_FILE \
+train.py --OTHER_CONFIGURATIONS
+```
+Here is a sample hostfile:
+```bash
+hostname1
+hostname2
+hostname3
+hostname4
+```
+Make sure master node can access all nodes (including itself) by ssh without password.
+
+Here is details about CLI arguments:
+* Pre-trained model path: `--pretrained`. Path to the pre-trained model in Hugging Face format.
+* Dataset path: `--dataset`. Path to the pre-tokenized dataset.
+* Booster plugin: `--plugin`. `gemini`, `gemini_auto`, `zero2`,`zero2_cpu` and `3d` are supported.For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins/).
+* Intermediate checkpoint to load: `--load_checkpoint`. Path to the intermediate checkpoint. Saved checkpoint contains the states for `lr_scheduler`, `optimizer`,`running_states.json` and `modelling`. If `load_checkpoint` points to the `modelling` folder, only the model weights will be loaded without any other states to support multi-stage training.
+* Save interval: `--save_interval`. The interval (steps) of saving checkpoints. The default value is 1000.
+* Checkpoint directory: `--save_dir`. The directoty path to save checkpoint and intermediate states. Intermediate states include `lr_scheduler`, `optimizer`,`running_states.json` and `modelling`.
+* Tensorboard directory: `--tensorboard_dir`. The path to save tensorboard logs.
+* Configuration file: `--config_file`. The path to save the configuration file.
+* Number of epochs: `--num_epochs`. Number of training epochs. The default value is 1.
+* Micro batch size: `--micro_batch_size`. Batch size per GPU. The default value is 1.
+* Learning rate: `--lr`. The default value is 3e-4.
+* Max length: `--max_length`. Max context length. The default value is 4096.
+* Mixed precision: `--mixed_precision`. The default value is "fp16". "fp16" and "bf16" are supported.
+* Gradient clipping: `--gradient_clipping`. The default value is 1.0.
+* Weight decay: `-w`, `--weight_decay`. The default value is 0.1.
+* Warmup steps: `-s`, `--warmup_steps`. The default value is calcuated by 0.025 warmup ratio.
+* Gradient checkpointing: `--use_grad_checkpoint`. The default value is `False`. This saves memory at the cost of speed. You'd better enable this option when training with a large batch size.
+* Flash attention: `--use_flash_attn`. If you want to use flash attention, you must install `flash-attn` and related packages. The default value is `False`. This is helpful to accelerate training while saving memory. We recommend you always use flash attention.
+* Freeze non-embedding parameters: `--freeze_non_embeds_params`. Freeze non-embedding parameters. It can be helpful to align embeddings after extending vocabulary size.
+* Tensor parallelism size: `--tp`. TP size for 3d Parallelism. The default value is 1.
+* Zero stage: `--zero`. Zero stage for 3d Parallelism. The default value is 1.
+
+#### 5. Running Command
+An [example bash](train.example.sh) is also provided for the experiment. Here is the steps to run the experiment:
+* Create your own hostfile: `cp hostfile.example hostfile`.
+* Create your own bash: `cp train.example.sh train.sh`.
+* Add your real host ip or host name into the `hostfile`.
+* Update global variables and parameters in your `train.sh`.
+* Run the experiment by `bash train.sh`
+
+Here is the details about global variables for each experiment:
+* `PROJECT_NAME`: Project name for each experiment.
+* `PARENT_SAVE_DIR`: Parent folder to save model checkpoint.
+* `PARENT_TENSORBOARD_DIR`: Parent folder to save tensorboard logs.
+* `PARENT_CONFIG_FILE`: Parent folder to save configuration for each experiment.
+* `PRETRAINED_MODEL_PATH`: Path to the local pre-trained model checkpoint.
+* `dataset`: Paths to all prepared data. Typically, it's a list of subfolders within the output path of prepare data, `--data_arrow_output_dir`, and if there are multiple subfolders, please list them all. e.g.,
+```python
+declare -a dataset=(
+ "/part-00000"
+ "/part-00001"
+ "/part-00000"
+)
+```
+## Technical Insights
+In order to enhance LLaMA-2's capabilities for understanding and generating Chinese content, The [Colossal-AI](https://github.com/hpcaitech/ColossalAI) team proposes the continuation of pre-training the LLaMA-2 model using both Chinese and English corpora. The overall pipeline can be described as follows:
+
+
+
+
+
+### Data
+Large language models such as LLaMA-2 have undergone training using a heterogeneous blend of high-quality datasets, yielding promising outcomes. Enhancing LLaMA-2's performance for the Chinese corpus, while preserving its proficiency in English, critically hinges on two pivotal factors: the composition of the dataset, which encompasses both English and Chinese content, and the quality of each constituent dataset.
+
+The following figure shows the data processing pipeline conducted for Colossal-LLaMA-2.
+
+
+
+
+❗️**Important**: We will open-source our data-processing toolkit soon, stay tuned!
+
+### Tokenizer
+The original LLaMA-2 vacabulary comprises fewer than a thousand Chinese characters, thus proves inadequate for encoding comprehensive Chinese texts effectively. Secondly, the utilization of byte tokens presents a challenge for transformer encoders to capture the semantic nuances of Chinese characters.
+
+To address the above issues, we extend LLaMA-2 vocabulary from 32,000 to 69,104. To adapt the LLaMA-2 model for use with the Colossal-LLaMA-2 tokenizer, we initialize the new word embeddings by calculating the mean values from the original LLaMA-2 embeddings and subsequently append these new rows to the end of the original embedding matrices.
+
+Advantages of extending vocabulary size:
+* Improve the compression rate of string sequence encoding.
+* Enhance the integrity of information.
+* Enable encoded sequences to contain more valuable information, thereby theoretically enhancing the ability for chapter-level encoding.
+
+Advantages of large vocabulary size under low-resource settings:
+* The presence of numerous unused tokens can be attributed to the limited training dataset, where an excessive number of tokens might not have been effectively learned.
+* Excessive vocabulary expansion leads to an increase in embedding-related parameters, resulting in higher memory usage, which, in turn, affects the efficiency of the training process.
+
+To balance both sides, we finally construct our vocabulary with size 69,104. The following table below presents a comparison of various models at the 7B level.
+
+| Model | Vocabulary Size | Compression Rate | Average Length of Samples (token-level) |
+| :-----------: | :---------: | :----: | :----: |
+| Colossal-LLaMA-2 | 69104 | 0.659 | 73.682 |
+| LLaMA-2-7B | 32000 | 1.205 | 134.689 |
+| Atom-7B | 65000 | 0.634 | 70.915 |
+| Baichuan-7B | 64000 | 0.678 | 75.857 |
+| Baichuan2-7B-base | 125696 | 0.570 | 63.761 |
+| Chatglm2-6B | 64789 | 0.645 | 72.178 |
+| InternLM-7B | 103168 | 0.566 | 63.349 |
+| Qwen-7B | 151643 | 0.578 | 64.703 |
+| Tigerbot-7B-base | 60515 | 0.630 | 70.515 |
+| Yayi-7B-llama2 | 32005 | 1.214 | 135.689 |
+| Chinese-llama-2-7b | 55296 | 0.668 | 74.690 |
+| Chinese-Falcon-7B | 90046 | 0.669 | 74.858 |
+| LinkSoul-Chinese-Llama-2-7b | 40076 | 0.958 | 107.089 |
+| Ziya-LLaMA-13B-v1.1 | 39410 | 0.958 | 107.074 |
+
+
+### Training Strategy
+#### Multi-stage Training
+In order to enhance the model's performance and harness the full potential of the original LLaMA-2, we have developed a multi-stage training strategy. This strategy is designed to systematically unlock the model's capabilities over a series of stages.
+
+Therefore, we have divided the training process into three stages:
+* Large-scale pre-training stage (Conducted by LLaMA-2): This initial stage is aimed at establishing the model's foundational capabilities from the ground up. It necessitates the use of a substantial dataset comprising no less than 1 trillion tokens.
+* Chinese knowledge injection stage: In this stage, we introduce Chinese knowledge into the model. It requires access to a high-quality dataset rich in comprehensive knowledge relevant to the Chinese language.
+* Knowledge replay stage: Knowledge is replayed through a question-answering (QA) mechanism, encompassing both the Chinese and English domains.
+
+Following the completion of this multi-stage training process, the model exhibits notable improvements in performance across both English and Chinese benchmarks.
+
+The following figure illustrates the three stages for training Colossal-LLaMA-2.
+
+
+
+
+
+#### Bucket-based Training
+Our experiments have revealed that the distributions within the training dataset, as well as the arrangement of various topic-related data points, significantly impact the overall performance of the model, particularly in the context of continual pre-training of LLaMA-2.
+
+In an effort to achieve a more balanced distribution and exert control over the dataset's ordering, we have adopted a method where we divide each sub-dataset into discrete bins. These bins are then combined to construct individual data buckets, with one bin contributed by each sub-dataset.
+
+### Bridging Any Domain-specific Large Models
+Applying the above process to perform knowledge transfer in any field allows for the cost-effective construction of lightweight domain-specific foundational large models.
+
+
+
+
+
+## Citations
+```bibtex
+@article{bian2021colossal,
+ title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training},
+ author={Bian, Zhengda and Liu, Hongxin and Wang, Boxiang and Huang, Haichen and Li, Yongbin and Wang, Chuanrui and Cui, Fan and You, Yang},
+ journal={arXiv preprint arXiv:2110.14883},
+ year={2021}
+}
+```
+```bibtex
+@misc{touvron2023llama,
+ title={Llama 2: Open Foundation and Fine-Tuned Chat Models},
+ author={Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic and Sergey Edunov and Thomas Scialom},
+ year={2023},
+ eprint={2307.09288},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
+```bibtex
+@article{dao2023flashattention2,
+ title={Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
+ author={Dao, Tri},
+ year={2023}
+}
+}
+```
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/__init__.py b/applications/Colossal-LLaMA-2/colossal_llama2/__init__.py
new file mode 100644
index 000000000000..56fafa58b3f4
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/__init__.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/dataset/__init__.py b/applications/Colossal-LLaMA-2/colossal_llama2/dataset/__init__.py
new file mode 100644
index 000000000000..56fafa58b3f4
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/dataset/__init__.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py b/applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py
new file mode 100644
index 000000000000..a2cfb2ef6264
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import os
+import random
+from dataclasses import dataclass
+from typing import Dict, List, Union, Sequence, Optional, Iterator, Callable
+
+import torch
+from datasets import dataset_dict, load_from_disk
+from datasets import Dataset as HFDataset
+from torch.distributed import ProcessGroup
+from torch.distributed.distributed_c10d import _get_default_group
+from torch.utils.data import ConcatDataset, Dataset, DataLoader, DistributedSampler
+from transformers.tokenization_utils import PreTrainedTokenizer
+import torch.nn.functional as F
+
+DatasetType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
+PathType = Union[str, os.PathLike]
+
+
+def load_tokenized_dataset(
+ dataset_paths: Union[PathType, List[PathType]], mode: str = "train"
+) -> Optional[DatasetType]:
+ """
+ Load pre-tokenized dataset.
+ Each instance of dataset is a dictionary with
+ `{'input_ids': List[int], 'labels': List[int], sequence: str}` format.
+ """
+ mode_map = {"train": "train", "dev": "validation", "test": "test"}
+ assert mode in tuple(mode_map), f"Unsupported mode {mode}, it must be in {tuple(mode_map)}"
+
+ if isinstance(dataset_paths, (str, os.PathLike)):
+ dataset_paths = [dataset_paths]
+
+ datasets = [] # `List[datasets.dataset_dict.Dataset]`
+ for ds_path in dataset_paths:
+ ds_path = os.path.abspath(ds_path)
+ assert os.path.exists(ds_path), f"Not existed file path {ds_path}"
+ ds_dict = load_from_disk(dataset_path=ds_path, keep_in_memory=False)
+ if isinstance(ds_dict, HFDataset):
+ datasets.append(ds_dict)
+ else:
+ if mode_map[mode] in ds_dict:
+ datasets.append(ds_dict[mode_map[mode]])
+ if len(datasets) == 0:
+ return None
+ if len(datasets) == 1:
+ return datasets.pop()
+ return ConcatDataset(datasets=datasets)
+
+
+@dataclass
+class DataCollatorForSupervisedDataset(object):
+ """
+ Collate instances for supervised dataset.
+ Each instance is a tokenized dictionary with fields
+ `input_ids`(List[int]), `labels`(List[int]) and `sequence`(str).
+ """
+
+ tokenizer: PreTrainedTokenizer
+ max_length: int = 4096
+ ignore_index: int = -100
+
+ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
+ """
+
+ Args:
+ instances (`Sequence[Dict[str, List[int]]]`):
+ Mini-batch samples, each sample is stored in an individual dictionary.
+
+ Returns:
+ (`Dict[str, torch.Tensor]`): Contains the following `torch.Tensor`:
+ `input_ids`: `torch.Tensor` of shape (bsz, max_len);
+ `attention_mask`: `torch.BoolTensor` of shape (bsz, max_len);
+ `labels`: `torch.Tensor` of shape (bsz, max_len), which contains `IGNORE_INDEX`.
+ """
+ assert isinstance(self.tokenizer.pad_token_id, int) and self.tokenizer.pad_token_id >= 0, (
+ f"`{self.tokenizer.__class__.__name__}.pad_token_id` must be a valid non-negative integer index value, "
+ f"but now `{self.tokenizer.pad_token_id}`"
+ )
+
+ # `List[torch.Tensor]`
+ batch_input_ids = [
+ torch.LongTensor(instance["input_ids"][: self.max_length])
+ if len(instance["input_ids"]) > self.max_length
+ else torch.LongTensor(instance["input_ids"])
+ for instance in instances
+ ]
+ batch_labels = [
+ torch.LongTensor(instance["labels"][: self.max_length])
+ if len(instance["labels"]) > self.max_length
+ else torch.LongTensor(instance["labels"])
+ for instance in instances
+ ]
+
+ if self.tokenizer.padding_side == "right":
+ input_ids = torch.nn.utils.rnn.pad_sequence(
+ sequences=batch_input_ids,
+ batch_first=True,
+ padding_value=self.tokenizer.pad_token_id,
+ ) # (bsz, max_len)
+ labels = torch.nn.utils.rnn.pad_sequence(
+ sequences=batch_labels,
+ batch_first=True,
+ padding_value=self.ignore_index,
+ ) # (bsz, max_len)
+ # pad to max
+ to_pad = self.max_length - input_ids.size(1)
+ input_ids = F.pad(input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
+ labels = F.pad(labels, (0, to_pad), value=self.ignore_index)
+ elif self.tokenizer.padding_side == "left":
+ reversed_input_ids = [seq.flip(dims=(0,)) for seq in batch_input_ids]
+ reversed_input_ids = torch.nn.utils.rnn.pad_sequence(
+ sequences=reversed_input_ids,
+ batch_first=True,
+ padding_value=self.tokenizer.pad_token_id,
+ ) # (bsz, max_len)
+ input_ids = torch.flip(reversed_input_ids, dims=(1,)) # (bsz, max_len)
+ reversed_labels = [seq.flip(dims=(0,)) for seq in batch_labels]
+ reversed_labels = torch.nn.utils.rnn.pad_sequence(
+ sequences=reversed_labels,
+ batch_first=True,
+ padding_value=self.ignore_index,
+ ) # (bsz, max_len)
+ labels = torch.flip(reversed_labels, dims=(1,)) # (bsz, max_len)
+ else:
+ raise RuntimeError(
+ f"`{self.tokenizer.__class__.__name__}.padding_side` can only be `left` or `right`, "
+ f"but now `{self.tokenizer.padding_side}`"
+ )
+
+ attention_mask = input_ids.ne(self.tokenizer.pad_token_id) # `torch.BoolTensor`, (bsz, max_len)
+
+ return dict(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
+
+
+class StatefulDistributedSampler(DistributedSampler):
+ """
+ Stateful distributed sampler for multi-stage training.
+ """
+
+ def __init__(
+ self,
+ dataset: DatasetType,
+ num_replicas: Optional[int] = None,
+ rank: Optional[int] = None,
+ shuffle: bool = True,
+ seed: int = 0,
+ drop_last: bool = False,
+ ) -> None:
+ super().__init__(
+ dataset=dataset,
+ num_replicas=num_replicas,
+ rank=rank,
+ shuffle=shuffle,
+ seed=seed,
+ drop_last=drop_last,
+ )
+ self.start_index = 0
+
+ def __iter__(self) -> Iterator:
+ iterator = super().__iter__()
+ indices = list(iterator)
+ indices = indices[self.start_index :]
+ return iter(indices)
+
+ def __len__(self) -> int:
+ return self.num_samples - self.start_index
+
+ def set_start_index(self, start_index: int) -> None:
+ self.start_index = start_index
+
+
+def setup_distributed_dataloader(
+ dataset: DatasetType,
+ batch_size: int = 1,
+ shuffle: bool = False,
+ seed: int = 1024,
+ drop_last: bool = False,
+ pin_memory: bool = False,
+ num_workers: int = 0,
+ collate_fn: Callable[[Sequence[Dict[str, Union[str, List[int]]]]], Dict[str, torch.Tensor]] = None,
+ process_group: Optional[ProcessGroup] = None,
+ **kwargs,
+) -> DataLoader:
+ """
+ Setup dataloader for distributed training.
+ """
+ _kwargs = kwargs.copy()
+ process_group = process_group or _get_default_group()
+ sampler = StatefulDistributedSampler(
+ dataset=dataset,
+ num_replicas=process_group.size(),
+ rank=process_group.rank(),
+ shuffle=shuffle,
+ seed=seed,
+ drop_last=drop_last,
+ )
+
+ # Deterministic dataloader
+ def seed_worker(worker_id: int) -> None:
+ worker_seed = seed
+ np.random.seed(worker_seed)
+ torch.manual_seed(worker_seed)
+ random.seed(worker_seed)
+
+ return DataLoader(
+ dataset=dataset,
+ batch_size=batch_size,
+ sampler=sampler,
+ num_workers=num_workers,
+ collate_fn=collate_fn,
+ pin_memory=pin_memory,
+ drop_last=drop_last,
+ worker_init_fn=seed_worker,
+ **_kwargs,
+ )
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/dataset/spliced_and_tokenized_dataset.py b/applications/Colossal-LLaMA-2/colossal_llama2/dataset/spliced_and_tokenized_dataset.py
new file mode 100644
index 000000000000..0c21f325ae62
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/dataset/spliced_and_tokenized_dataset.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Splicing multiple pre-tokenized sequence data points
+"""
+
+import random
+import warnings
+from copy import deepcopy
+from datasets import dataset_dict
+from typing import Any, Callable, Dict, Iterable, List, Union, Tuple
+
+from torch.utils.data import ConcatDataset, Dataset, IterableDataset
+from transformers.models.llama.tokenization_llama import LlamaTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+IGNORE_INDEX = -100
+
+DSType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
+
+
+def supervised_tokenize(
+ data_point: Dict[str, str], tokenizer: LlamaTokenizer, ignore_index: int = None, max_length: int = 4096
+) -> Dict[str, Union[int, str, List[int]]]:
+ """
+ A tokenization function to tokenize an original pretraining data point as following:
+ {"source": "", "target": "Beijing, the capital of the People's Republic of China, ...", "category": "geography"}
+ """
+ assert tokenizer.add_bos_token is False and tokenizer.add_eos_token is False, (
+ "Initially set `tokenizer.add_bos_token` and `tokenizer.add_eos_token` to False, "
+ "add and manually later"
+ )
+ if ignore_index is None:
+ ignore_index = IGNORE_INDEX
+
+ source_text = data_point["source"] # `str`
+ target_text = data_point["target"] # `str`
+ is_null_source = len(source_text) == 0
+
+ source_text = tokenizer.bos_token + source_text
+ target_text += tokenizer.eos_token
+ sequence_text = source_text + target_text
+
+ tokenized = tokenizer([source_text, sequence_text])["input_ids"]
+ sequence_input_ids = tokenized[1]
+ sequence_labels = deepcopy(sequence_input_ids)
+
+ source_length = len(tokenized[0])
+ if not is_null_source:
+ sequence_labels[:source_length] = [ignore_index for _ in range(source_length)]
+
+ # sequence truncation.
+ if len(sequence_input_ids) > max_length:
+ sequence_input_ids = sequence_input_ids[:max_length]
+ sequence_labels = sequence_labels[:max_length]
+
+ return dict(
+ input_ids=sequence_input_ids,
+ labels=sequence_labels,
+ seq_length=len(sequence_input_ids),
+ seq_category=data_point["category"],
+ )
+
+
+class ClosedToConstantLengthSplicedDataset(IterableDataset):
+ """
+ Define an iterable dataset that returns a (close to) constant length data point spliced from multiple
+ original independent (pre-tokenized) data points.
+ """
+
+ def __init__(
+ self,
+ dataset: DSType,
+ tokenizer: PreTrainedTokenizer,
+ max_length: int = 4096,
+ num_packed_sequences: int = 8,
+ fetch_sequence_func: Callable[[Any], Tuple[List[int], List[int]]] = None,
+ input_ids_field: str = "input_ids",
+ labels_field: str = "labels",
+ infinite: bool = False,
+ shuffle: bool = True,
+ error_strict: bool = False,
+ ) -> None:
+ self.tokenizer = tokenizer
+ self.dataset = dataset
+ self.max_length = max_length
+ self.infinite = infinite
+ self.max_buffer_size = max_length * num_packed_sequences # e.g., 4096 * 16
+ self.shuffle = shuffle
+
+ # Callable[[Dict[str, Any]], Tuple[List[int], List[int]]],
+ # A function that fetch sequence input_ids and labels from the original data point
+ if fetch_sequence_func is None:
+ self.fetch_sequence_func = lambda data_point: (data_point[input_ids_field], data_point[labels_field])
+ else:
+ self.fetch_sequence_func = fetch_sequence_func
+ self.input_ids_field = input_ids_field
+ self.labels_field = labels_field
+
+ self.error_strict = error_strict
+ self.current_size = 0 # `int`, current packed data size.
+
+ def __len__(self) -> int:
+ return len(self.dataset)
+
+ def __iter__(self) -> Iterable[Dict[str, List[int]]]:
+ iterator = iter(self.dataset)
+ more_data_points = True
+ while more_data_points is True:
+ buffer, buffer_len = [], 0
+ while True:
+ # ending condition.
+ if buffer_len >= self.max_buffer_size:
+ break
+ try:
+ # `Tuple[List[int], List[int]]`
+ seq_input_ids, seq_labels = self.fetch_sequence_func(next(iterator))
+ buffer.append({self.input_ids_field: seq_input_ids, self.labels_field: seq_labels})
+ buffer_len += len(buffer[-1][self.input_ids_field])
+ except StopIteration:
+ if self.infinite is True:
+ iterator = iter(self.dataset)
+ warnings.warn("The dataset reached end and the iterator is reset to the start.")
+ else:
+ more_data_points = False
+ break
+ examples = [] # `List[Dict[str, List[int]]]`, save buffered spliced data points.
+ spliced_input_ids, spliced_labels = [], [] # `List[int]`, `List[int]`
+ for i, data_point in enumerate(buffer):
+ # TODO(2023-09-18) check errors for each unspliced tokenized data point
+ seq_input_ids = data_point[self.input_ids_field]
+ seq_labels = data_point[self.labels_field]
+ # Handle special case:
+ # If the length of an original data point (i.e., input_ids length of a data point before splicing)
+ # exceeds `max_length`, truncate it.
+ if len(seq_input_ids) > self.max_length:
+ truncated_seq_input_ids = seq_input_ids[: self.max_length]
+ truncated_label_ids = seq_labels[: self.max_length]
+ if set(truncated_label_ids) == {IGNORE_INDEX}:
+ if self.error_strict is True:
+ raise ValueError(
+ f"Find an out-of-bounds length({len(seq_input_ids)}) data point "
+ f"with all label values as {IGNORE_INDEX}."
+ )
+ else:
+ warnings.warn(f"Filter an error truncated data point (labels all {IGNORE_INDEX})")
+ continue # Skip the current error data point.
+ spliced_data_point = {
+ self.input_ids_field: truncated_seq_input_ids,
+ self.labels_field: truncated_label_ids,
+ }
+ examples.append(spliced_data_point)
+ warnings.warn("Find a data point to be truncated.")
+ continue
+
+ # Pre action judgment.
+ if len(spliced_input_ids) + len(seq_input_ids) > self.max_length:
+ spliced_data_point = {
+ self.input_ids_field: spliced_input_ids,
+ self.labels_field: spliced_labels,
+ } # `Dict[str, List[int]]`
+ # Update.
+ spliced_input_ids, spliced_labels = [], []
+ spliced_input_ids.extend(seq_input_ids)
+ spliced_labels.extend(seq_labels)
+ examples.append(spliced_data_point)
+ else:
+ spliced_input_ids.extend(seq_input_ids)
+ spliced_labels.extend(seq_labels)
+ # For residual spliced data point at the end of the data set
+ if self.infinite is False and more_data_points is False and len(spliced_input_ids) > 0:
+ examples.append(
+ {
+ self.input_ids_field: spliced_input_ids,
+ self.labels_field: spliced_labels
+ }
+ )
+ if self.shuffle:
+ random.shuffle(examples)
+ for spliced_data_point in examples:
+ # TODO(2023-09-18): check errors for each spliced tokenized data point.
+ self.current_size += 1
+ yield spliced_data_point
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/model/init_model.py b/applications/Colossal-LLaMA-2/colossal_llama2/model/init_model.py
new file mode 100644
index 000000000000..67e487f43b08
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/model/init_model.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Initialize new model with updated tokenizer by calculating the mean values from original model
+"""
+import argparse
+
+import numpy as np
+import torch
+from transformers import LlamaTokenizer, LlamaForCausalLM
+
+from colossalai.logging import get_dist_logger
+
+
+logger = get_dist_logger()
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--source_model_and_tokenizer_path",
+ type=str,
+ required=True,
+ default=None,
+ help="Source path of model & tokenizer",
+ )
+ parser.add_argument("--target_tokenizer_path", type=str, required=True, default=None, help="Target tokenizer path")
+ parser.add_argument("--target_model_path", type=str, required=True, default=None, help="Target model path")
+ args = parser.parse_args()
+
+ source_tokenizer = LlamaTokenizer.from_pretrained(args.source_model_and_tokenizer_path)
+ source_tokenizer.add_bos_token = False
+ source_tokenizer.add_eos_token = False
+ if source_tokenizer.pad_token is None:
+ source_tokenizer.pad_token = source_tokenizer.unk_token
+ source_vocab = source_tokenizer.get_vocab()
+
+ target_tokenizer = LlamaTokenizer.from_pretrained(args.target_tokenizer_path)
+ target_tokenizer.add_bos_token = False
+ target_tokenizer.add_eos_token = False
+ if target_tokenizer.pad_token is None:
+ target_tokenizer.pad_token = target_tokenizer.unk_token
+ target_vocab = target_tokenizer.get_vocab()
+ target_inverted_vocab = {v: k for k, v in target_vocab.items()}
+
+ assert len(target_vocab) > len(
+ source_vocab
+ ), f"Target vocab size({len(target_vocab)}) must be greater than source vocab size({len(source_vocab)})"
+
+ gpu_device = torch.device("cuda:0")
+ cpu_device = torch.device("cpu")
+
+ source_model = LlamaForCausalLM.from_pretrained(args.source_model_and_tokenizer_path)
+ source_model.eval()
+ source_model = source_model.to(gpu_device)
+
+ source_input_embeddings = source_model.get_input_embeddings()
+ assert isinstance(source_input_embeddings, torch.nn.Embedding)
+ assert source_input_embeddings.weight.shape[0] == len(source_vocab)
+ source_input_embeddings.eval()
+
+ source_output_embeddings = source_model.get_output_embeddings()
+ assert isinstance(source_output_embeddings, torch.nn.Linear)
+ assert source_output_embeddings.bias is None
+ assert source_output_embeddings.weight.shape[0] == len(source_vocab)
+ source_output_embeddings.eval()
+
+ input_embeddings = source_input_embeddings.weight.cpu().detach().numpy()
+ output_embeddings = source_output_embeddings.weight.cpu().detach().numpy()
+ for i in range(len(source_vocab), len(target_vocab)):
+ if i % 500 == 0:
+ logger.info(f"processing {i}/{len(target_vocab)} target tokens")
+ target_token = target_inverted_vocab[i]
+ target_to_source_token_ids = torch.LongTensor(source_tokenizer([target_token])["input_ids"][0])
+ target_to_source_token_ids = target_to_source_token_ids.to(gpu_device)
+
+ target_to_source_input_embedding = (
+ source_input_embeddings.weight[target_to_source_token_ids]
+ .mean(dim=0)
+ .unsqueeze(dim=0)
+ .cpu()
+ .detach()
+ .numpy()
+ )
+ target_to_source_output_embedding = (
+ source_output_embeddings.weight[target_to_source_token_ids]
+ .mean(dim=0)
+ .unsqueeze(dim=0)
+ .cpu()
+ .detach()
+ .numpy()
+ )
+
+ input_embeddings = np.concatenate((input_embeddings, target_to_source_input_embedding), axis=0)
+ output_embeddings = np.concatenate((output_embeddings, target_to_source_output_embedding), axis=0)
+
+ source_model = source_model.to(cpu_device)
+ assert isinstance(source_model, LlamaForCausalLM)
+
+ # expand
+ source_model.resize_token_embeddings(new_num_tokens=len(target_vocab))
+ source_model.model.embed_tokens.weight.data = torch.Tensor(input_embeddings)
+ source_model.lm_head.weight.data = torch.Tensor(output_embeddings)
+
+ source_model = source_model.half()
+ source_model.save_pretrained(save_directory=args.target_model_path)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py b/applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py
new file mode 100644
index 000000000000..43297633db1a
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+Initialize new tokenizer for continual pre-training
+"""
+
+import argparse
+import os
+import json
+from typing import List, Union
+
+from transformers.models.llama.tokenization_llama import LlamaTokenizer
+from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
+
+from colossalai.logging import get_dist_logger
+
+os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+
+logger = get_dist_logger()
+
+
+def expand_vocab_tokenizer(
+ source_tokenizer_dir: Union[str, os.PathLike], target_tokenizer_dir: Union[str, os.PathLike], new_tokens: List[str]
+) -> None:
+ """Expand tokenizer for continue pre-training."""
+ if os.path.exists(target_tokenizer_dir):
+ raise RuntimeError(f"Find existed directory {target_tokenizer_dir}")
+
+ source_tokenizer = LlamaTokenizer.from_pretrained(source_tokenizer_dir)
+ logger.info(source_tokenizer)
+ source_sp_processor = source_tokenizer.sp_model
+ source_spm = sp_pb2_model.ModelProto()
+ source_spm.ParseFromString(source_sp_processor.serialized_model_proto())
+
+ logger.info(f"Source tokenizer size: {len(source_sp_processor)}")
+
+ # Add new tokens to source tokenizer.
+ source_spm_tokens = set([p.piece for p in source_spm.pieces])
+ for piece in new_tokens:
+ assert isinstance(piece, str), f"Invalid token({piece}) type {type(piece)}"
+ if piece in source_spm_tokens:
+ # Skip existed token.
+ continue
+ new_p = sp_pb2_model.ModelProto().SentencePiece()
+ new_p.piece = piece
+ new_p.score = 0
+ source_spm.pieces.append(new_p)
+ logger.info(f"Expand vocab from {len(source_spm_tokens)} to {len(source_spm.pieces)}")
+
+ # Save
+ os.makedirs(target_tokenizer_dir)
+ target_tokenizer_model_path = os.path.join(target_tokenizer_dir, "tokenizer.model")
+ with open(file=target_tokenizer_model_path, mode="wb") as fp:
+ fp.write(source_spm.SerializeToString())
+
+ target_tokenizer = LlamaTokenizer(vocab_file=target_tokenizer_model_path)
+ target_tokenizer.save_pretrained(save_directory=target_tokenizer_dir)
+ logger.info(f"Successfully save expand tokenizer to {target_tokenizer_dir}")
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--source_tokenizer_dir", type=str, required=True, default=None, help="Source tokenizer directory"
+ )
+ parser.add_argument(
+ "--target_tokenizer_dir", type=str, required=True, default=None, help="Target tokenizer directory"
+ )
+ parser.add_argument(
+ "--expand_tokens_file",
+ type=str,
+ required=True,
+ default=None,
+ help="Path of the file containing tokens to be extended",
+ )
+ args = parser.parse_args()
+
+ expand_tokens = []
+ with open(file=args.expand_tokens_file, mode="r", encoding="utf-8") as fp_reader:
+ for line in fp_reader:
+ item = json.loads(line)
+ # e.g., {"piece": "你好"}
+ token = item["piece"]
+ if token in expand_tokens:
+ continue
+ expand_tokens.append(token)
+ expand_tokens.sort(key=lambda t: len(t), reverse=False)
+
+ expand_vocab_tokenizer(
+ source_tokenizer_dir=args.source_tokenizer_dir,
+ target_tokenizer_dir=args.target_tokenizer_dir,
+ new_tokens=expand_tokens,
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/__init__.py b/applications/Colossal-LLaMA-2/colossal_llama2/utils/__init__.py
new file mode 100644
index 000000000000..56fafa58b3f4
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/utils/__init__.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/ckpt_io.py b/applications/Colossal-LLaMA-2/colossal_llama2/utils/ckpt_io.py
new file mode 100644
index 000000000000..85decf37dd0b
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/utils/ckpt_io.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Helper functions for IO
+"""
+
+import json
+import os
+from typing import Any, Dict, Tuple, Union
+
+import torch
+from torch.optim.optimizer import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+
+from colossalai.booster import Booster
+from colossalai.cluster import DistCoordinator
+
+
+def load_json(file_path: Union[str, os.PathLike]) -> Dict[str, Any]:
+ """
+ Load file in JSON format
+ """
+ with open(file=file_path, mode="r", encoding="utf-8") as fp:
+ return json.load(fp)
+
+
+def save_json(data: Dict[str, Any], file_path: Union[str, os.PathLike]) -> None:
+ """
+ Save as JSON format
+ """
+ with open(file=file_path, mode="w", encoding="utf-8") as fp:
+ json.dump(data, fp=fp, ensure_ascii=False, indent=4)
+
+
+def save_checkpoint(
+ save_dir: Union[str, os.PathLike],
+ booster: Booster,
+ model: torch.nn.Module,
+ optimizer: Optimizer,
+ lr_scheduler: _LRScheduler,
+ epoch: int,
+ step: int,
+ batch_size: int,
+ coordinator: DistCoordinator,
+) -> None:
+ """
+ Save model checkpoint, optimizer, LR scheduler and intermedidate running states.
+ """
+
+ save_dir = os.path.join(save_dir, f"epoch-{epoch}_step-{step}")
+ os.makedirs(os.path.join(save_dir, "modeling"), exist_ok=True)
+
+ booster.save_model(model, os.path.join(save_dir, "modeling"), shard=True)
+
+ booster.save_optimizer(optimizer, os.path.join(save_dir, "optimizer"), shard=True)
+ booster.save_lr_scheduler(lr_scheduler, os.path.join(save_dir, "lr_scheduler"))
+ running_states = {
+ "epoch": epoch,
+ "step": step,
+ "sample_start_index": step * batch_size,
+ }
+ if coordinator.is_master():
+ save_json(running_states, os.path.join(save_dir, "running_states.json"))
+
+
+def load_checkpoint(
+ load_dir: Union[str, os.PathLike],
+ booster: Booster,
+ model: torch.nn.Module,
+ optimizer: Optimizer,
+ lr_scheduler: _LRScheduler,
+) -> Tuple[int, int, int]:
+ """
+ Load model checkpoint, optimizer, LR scheduler and intermedidate running states.
+ """
+
+ # Update booster params states.
+ booster.load_model(model=model, checkpoint=os.path.join(load_dir, "modeling"))
+ booster.load_optimizer(optimizer=optimizer, checkpoint=os.path.join(load_dir, "optimizer"))
+ booster.load_lr_scheduler(lr_scheduler=lr_scheduler, checkpoint=os.path.join(load_dir, "lr_scheduler"))
+
+ running_states = load_json(file_path=os.path.join(load_dir, "running_states.json"))
+ return (
+ running_states["epoch"],
+ running_states["step"],
+ running_states["sample_start_index"],
+ )
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/flash_attention_patch.py b/applications/Colossal-LLaMA-2/colossal_llama2/utils/flash_attention_patch.py
new file mode 100644
index 000000000000..1926ec78aba8
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/utils/flash_attention_patch.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from types import MethodType
+from typing import Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from flash_attn.bert_padding import pad_input, unpad_input
+from flash_attn.flash_attn_interface import flash_attn_func, flash_attn_varlen_kvpacked_func
+from flash_attn.ops.rms_norm import rms_norm
+from transformers.models.llama.modeling_llama import (
+ LlamaAttention,
+ LlamaForCausalLM,
+ LlamaModel,
+ LlamaRMSNorm,
+ apply_rotary_pos_emb,
+ repeat_kv,
+)
+
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
+
+def _prepare_decoder_attention_mask(
+ self: LlamaModel,
+ attention_mask: torch.BoolTensor,
+ input_shape: torch.Size,
+ inputs_embeds: torch.Tensor,
+ past_key_values_length: int,
+) -> Optional[torch.Tensor]:
+ """
+ Decoder attetion mask
+ """
+ if past_key_values_length > 0 and attention_mask is not None:
+ attention_mask = torch.cat(
+ tensors=(
+ torch.full(
+ size=(input_shape[0], past_key_values_length),
+ fill_value=True,
+ dtype=attention_mask.dtype,
+ device=attention_mask.device,
+ ),
+ attention_mask,
+ ),
+ dim=-1,
+ ) # (bsz, past_key_values_length + q_len)
+ if attention_mask is not None and torch.all(attention_mask):
+ return None # Faster
+ return attention_mask
+
+
+def attention_forward(
+ self: LlamaAttention,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """
+ Re-define LLaMA-2 `LlamaAttention` forward method using flash-attention.
+ """
+ if output_attentions:
+ logger.warning(
+ "Argument `output_attentions` is not supported for flash-attention patched `LlamaAttention`, "
+ "return `None` instead."
+ )
+
+ bsz, q_len, _ = hidden_states.size()
+
+ if self.config.pretraining_tp > 1:
+ q_slicing, kv_slicing = (
+ dim // self.config.pretraining_tp
+ for dim in (
+ self.num_heads * self.head_dim,
+ self.num_key_value_heads * self.head_dim,
+ )
+ ) # `Tuple[int, int]`
+ q_slices, k_slices, v_slices = (
+ proj.weight.split(slicing, dim=0)
+ for proj, slicing in (
+ (self.q_proj, q_slicing),
+ (self.k_proj, kv_slicing),
+ (self.v_proj, kv_slicing),
+ )
+ ) # Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor], Tuple[torch.Tensor]]
+ q, k, v = (
+ torch.cat(
+ [F.linear(hidden_states, slices[i]) for i in range(self.config.pretraining_tp)],
+ dim=-1,
+ )
+ for slices in (q_slices, k_slices, v_slices)
+ )
+ # `Tuple[torch.Tensor, torch.Tensor, torch.Tensor]` of shape:
+ # (bsz, q_len, num_heads * head_dim),
+ # (bsz, q_len, num_key_value_heads * head_dim),
+ # (bsz, q_len, num_key_value_heads * head_dim)
+ else:
+ q, k, v = (proj(hidden_states) for proj in (self.q_proj, self.k_proj, self.v_proj))
+ # `Tuple[torch.Tensor, torch.Tensor, torch.Tensor]` of shape:
+ # (bsz, q_len, num_heads * head_dim),
+ # (bsz, q_len, num_key_value_heads * head_dim),
+ # (bsz, q_len, num_key_value_heads * head_dim)
+
+ # (bsz, q_len, num_heads * head_dim) -> (bsz, num_heads, q_len, head_dim);
+ # (bsz, q_len, num_key_value_heads * head_dim) -> (bsz, num_key_value_heads, q_len, head_dim);
+ # (bsz, q_len, num_key_value_heads * head_dim) -> (bsz, num_key_value_heads, q_len, head_dim)
+ q, k, v = (
+ states.view(bsz, q_len, num_heads, self.head_dim).transpose(1, 2)
+ for states, num_heads in (
+ (q, self.num_heads),
+ (k, self.num_key_value_heads),
+ (v, self.num_key_value_heads),
+ )
+ )
+ kv_len = k.shape[-2] # initially, `kv_len` == `q_len`
+ past_kv_len = 0
+ if past_key_value is not None:
+ # if `past_key_value` is not None, `kv_len` > `q_len`.
+ past_kv_len = past_key_value[0].shape[-2]
+ kv_len += past_kv_len
+
+ # two `torch.Tensor` objs of shape (1, 1, kv_len, head_dim)
+ cos, sin = self.rotary_emb(v, seq_len=kv_len)
+ # (bsz, num_heads, q_len, head_dim), (bsz, num_key_value_heads, q_len, head_dim)
+ q, k = apply_rotary_pos_emb(q=q, k=k, cos=cos, sin=sin, position_ids=position_ids)
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ k = torch.cat([past_key_value[0], k], dim=2)
+ v = torch.cat([past_key_value[1], v], dim=2)
+
+ past_key_value = (k, v) if use_cache else None
+
+ # repeat k/v heads if n_kv_heads < n_heads
+ k = repeat_kv(hidden_states=k, n_rep=self.num_key_value_groups)
+ # (bsz, num_key_value_heads, q_len, head_dim) -> (bsz, num_heads, q_len, head_dim)
+ v = repeat_kv(hidden_states=v, n_rep=self.num_key_value_groups)
+ # (bsz, num_key_value_heads, q_len, head_dim) -> (bsz, num_heads, q_len, head_dim)
+
+ key_padding_mask = attention_mask
+ # (bsz, num_heads, q_len, head_dim) -> (bsz, q_len, num_heads, head_dim)
+ q, k, v = (states.transpose(1, 2) for states in (q, k, v))
+
+ if past_kv_len > 0:
+ q = torch.cat(
+ tensors=(
+ torch.full(
+ size=(bsz, past_kv_len, self.num_heads, self.head_dim),
+ fill_value=0.0,
+ dtype=q.dtype,
+ device=q.device,
+ ),
+ q,
+ ),
+ dim=1,
+ ) # (bsz, past_kv_len + q_len, num_heads, head_dim)
+
+ if key_padding_mask is None:
+ # (bsz, past_kv_len + q_len, num_heads, head_dim)
+ output = flash_attn_func(q=q, k=k, v=v, dropout_p=0.0, softmax_scale=None, causal=True) # (bsz, )
+ output = rearrange(output, pattern="... h d -> ... (h d)") # (bsz, past_kv_len + q_len, num_heads * head_dim)
+ else:
+ q, indices, cu_q_lens, max_q_len = unpad_input(hidden_states=q, attention_mask=key_padding_mask)
+ kv, _, cu_kv_lens, max_kv_len = unpad_input(
+ hidden_states=torch.stack(tensors=(k, v), dim=2),
+ attention_mask=key_padding_mask,
+ )
+ output_unpad = flash_attn_varlen_kvpacked_func(
+ q=q,
+ kv=kv,
+ cu_seqlens_q=cu_q_lens,
+ cu_seqlens_k=cu_kv_lens,
+ max_seqlen_q=max_q_len,
+ max_seqlen_k=max_kv_len,
+ dropout_p=0.0,
+ softmax_scale=None,
+ causal=True,
+ )
+ output = pad_input(
+ hidden_states=rearrange(output_unpad, pattern="nnz h d -> nnz (h d)"),
+ indices=indices,
+ batch=bsz,
+ seqlen=past_kv_len + q_len,
+ ) # (bsz, past_kv_len + q_len, num_heads * head_dim)
+
+ if past_kv_len > 0:
+ # Strip off the zero query outputs.
+ output = output[:, past_kv_len:, ...] # (bsz, q_len, num_heads * head_dim)
+ output = self.o_proj(output) # (bsz, q_len, hidden_size)
+ return output, None, past_key_value
+
+
+def rms_norm_forward(self: LlamaRMSNorm, hidden_states: torch.Tensor) -> torch.Tensor:
+ """
+ Formard function for RMS Norm
+ """
+ return rms_norm(x=hidden_states, weight=self.weight, epsilon=self.variance_epsilon)
+
+
+def replace_with_flash_attention(model: LlamaForCausalLM) -> None:
+ for name, module in model.named_modules():
+ if isinstance(module, LlamaAttention):
+ module.forward = MethodType(attention_forward, module)
+ if isinstance(module, LlamaModel):
+ module._prepare_decoder_attention_mask = MethodType(_prepare_decoder_attention_mask, module)
+ if isinstance(module, LlamaRMSNorm):
+ module.forward = MethodType(rms_norm_forward, module)
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/froze.py b/applications/Colossal-LLaMA-2/colossal_llama2/utils/froze.py
new file mode 100644
index 000000000000..82677160d868
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/utils/froze.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from transformers.models.llama import LlamaForCausalLM
+
+
+def freeze_non_embeds_parameters(model: LlamaForCausalLM) -> None:
+ """Freeze all parameters except embeddings."""
+ for name, params in model.named_parameters():
+ if "embed_tokens" not in name and "lm_head" not in name:
+ params.requires_grad = False
+ else:
+ params.requires_grad = True
+
+
+def unfreeze_parameters(model: LlamaForCausalLM) -> None:
+ for name, params in model.named_parameters():
+ params.requires_grad = False
diff --git a/applications/Colossal-LLaMA-2/docs/example.md b/applications/Colossal-LLaMA-2/docs/example.md
new file mode 100644
index 000000000000..d889ab4165d0
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/docs/example.md
@@ -0,0 +1,245 @@
+# Colossal-LLaMA-2-7B-base Examples
+To comprehensively assess the performance of the Colossal-LLaMA-2-7B-base model, our team conducted human evaluations across various knowledge domains and tasks. These tasks encompassed Knowledge QA in 10 different areas, Content Generation, Brainstorming, Summarization, Sentiment Analysis, Logical Reasoning, Information Extraction, Role-play, Classification, and Rewriting. We also conducted a comparative analysis, pitting the generation results of Colossal-LLaMA-2-7B-base against Qwen-7b-base, a recently-pretrained model known for its impressive performance. It's worth noting that both models shared identical inference parameters. For each question, we generated model responses multiple times and selected the best one as the final answer.
+
+
+
+## Table of Contents
+- [Examples](#knowledge-qa)
+ - [Knowledge QA](#knowledge-qa)
+ - [Travel](#travel)
+ - [Medical](#medical)
+ - [Food](#food)
+ - [Computer Science](#computer-science)
+ - [Game](#game)
+ - [Sports](#sports)
+ - [Finance](#finance)
+ - [Law](#law)
+ - [Technology](#technology)
+ - [Literature](#literature)
+ - [Content Generation](#content-generation)
+ - [Writing](#writing)
+ - [Code](#code)
+ - [E-mail](#e-mail)
+ - [LaTex Table](#latex-table)
+ - [Brainstorming](#brainstorming)
+ - [Summarization](#summarization)
+ - [Sentiment Analysis](#sentiment-analysis)
+ - [Logical Reasoning](#logical-reasoning)
+ - [Information Extraction](#information-extraction)
+ - [Role-play](#role-play)
+ - [Classification](#classification)
+ - [Rewriting](#rewriting)
+- [Conclusion](#conclusion)
+
+
+### Knowledge QA
+#### Travel
+| Question |
以上是几个适合家庭度假的目的地,希望能给您提供一些参考。 |
+| 请问武汉的最佳的旅行季节是什么时候? | 3-4月份到武汉赏樱,10月份到武汉看黄陂枫树。武汉的最佳的旅行季节是春秋,3-4月份到武汉赏樱,10月份到武汉看黄陂枫树。武汉,简称“汉”别称江城,是湖北省省会,中部六省唯一的副省级市,特大城市,中国中部地区的中心城市,全国重要的工业基地、科教基地和综合交通枢纽。 | Assistant: 武汉的最佳旅行季节是春季和秋季。春季气温宜人,可以赏樱花,秋季天气宜人,可以欣赏红叶。夏季温度较高,游客众多,冬季则有雪景和温泉可赏,但冬季的交通和住宿可能会更加拥挤。 |
+| What are amazing islands near Singapore that could be visited? | Singapore is one of the most developed and well-connected cities in the world, with an impressive skyline and modern infrastructure. However, there are many amazing islands near Singapore that are worth visiting. Here are some of the most popular and beautiful islands near Singapore:
1. Sentosa Island: Sentosa Island is a small island located off the southern coast of Singapore. It is a popular tourist destination, known for its beaches, theme parks, and lush greenery. Visitors can enjoy a range of activities on the island, including sunbathing, swimming, and exploring the jungles.
2. Pulau Ubin: Pulau Ubin is a small island located about 45 minutes away from Singapore by boat. It is a popular destination for nature lovers, offering hiking trails, cycling paths, and a variety of wildlife. Visitors can also explore the fishing village and try their hand at fishing.
3. Pulau Tekong: Pulau Tekong is a small island located about 30 minutes away from Singapore by boat. It is a former military base, but is now a popular destination for nature lovers. Visitors can enjoy hiking trails, cycling paths, and a variety of wildlife.
4. Pulau Sibuyu: Pulau Sibuyu is a small island located about 20 minutes away from Singapore by boat. It is a popular destination for water sports enthusiasts, offering snorkeling, diving, and kayaking. Visitors can also explore the fishing village and try their hand at fishing.
5. Pulau Rapa: Pulau Rapa is a small island located about 20 minutes away from Singapore by boat. It is a popular destination for birdwatchers, offering a variety of bird species and birdwatching spots. Visitors can also explore the fishing village and try their hand at fishing. Overall, these are some of the amazing islands near Singapore that are worth visiting. Whether you are looking for adventure, relaxation, or simply want to explore the natural beauty of Singapore, these islands offer something for everyone. | "What are some good islands near Singapore with a short distance? Which one is recommended?"
Please tell me if those questions are the same. OPTIONS: - no - yes
no |
+| Can you recommend local place to eat breakfast and dinner in Shanghai? | Yes, I can certainly recommend some great local places to eat breakfast and dinner in Shanghai. Here are some suggestions:
1. Dumpling House: This popular restaurant serves delicious dumplings and other Chinese dishes for breakfast and dinner. They have multiple locations throughout Shanghai, so you can easily find one near you.
2. Yuyuan Brewers Fresh Beer Restaurant: This unique restaurant offers a variety of fresh beers and delicious food, including breakfast and dinner options. It's a fun and relaxed atmosphere to enjoy your meal.
3. Zenbox Coffee & Bakery: This trendy coffee shop is the perfect spot to start your day with a delicious breakfast sandwich or pastry. They also have a great selection of lunch options, so you can continue enjoying your meal in the afternoon.
4. Shanghai Taste: This authentic Shanghai restaurant offers traditional Chinese dishes, including noodles and stir-fry dishes, for breakfast and dinner. The cozy atmosphere and friendly service make it a great choice for a meal.
5. XLR8 Family Cafe: This family-friendly cafe offers breakfast and dinner options, including a healthy salad bar. They also have a kids' menu with plenty of options for little ones.
These are just a few examples of the many great places to eat breakfast and dinner in Shanghai. With so many options to choose from, you're sure to find something that suits your taste and budget. | I am traveling on budget and will have access to public transportation to reach the location.
Assistant: Sure! Here are some local places to eat breakfast and dinner in Shanghai that you might want to consider:
Breakfast:
1. Fresh on the Square - They have a variety of breakfast options, including sandwiches, smoothies, and porridge. The prices are affordable, and they have a location at People's Square. 2. The Dumpling House - This restaurant is known for its delicious dumplings and offers an extensive menu of Chinese breakfast items, such as baozi, soy milk, and jianbing. The location is near the Jing'an Temple subway station. 3. Shanghainese Roast Duck - This restaurant is famous for its crispy duck, but they also have breakfast items like rice porridge, nian gao (sticky rice cake), and soy milk. The location is near the Xinzhuang Road subway station.
Dinner:
1. Lost Heaven - This restaurant offers a fusion of Chinese and Western cuisine in a beautiful garden setting. Prices are on the higher side, but the quality of food is excellent. 2. Shanghainese Restaurant - This restaurant is a must-try if you want to experience authentic Shanghai food. They have a variety of Shanghainese dishes, such as hairy crab, xiao long bao (steamed soup dumplings), and braised pork belly. The location is near the Hongqiao Railway Station. 3. Lao Da Zhou - This restaurant is known for its delicious xiao long bao and other Shanghainese dishes. Prices are reasonable, and they have a location near the Zhongshan Park subway station.
I hope this helps you plan your budget-friendly food itinerary in Shanghai! |
+
+
+
+#### Medical
+| Question |
请注意,以上方法只是缓解症状的方法,不能替代医生的诊断和治疗。建议您尽快就医,以便早日恢复健康。 |
+| How can one treat intense pain in the stomach while suffering from ulcer? | Intense pain in the stomach is a common symptom of ulcers, and it can be quite uncomfortable and painful. It is essential to seek medical attention as soon as possible to obtain an accurate diagnosis and appropriate treatment. Treatment for ulcer-related pain usually involves managing pain with medication, addressing the underlying cause of the ulcer, and maintaining a healthy lifestyle.
1. Manage pain with medication:
There are several over-the-counter pain relievers available that can help alleviate symptoms. Examples of these medications include nonsteroidal anti-inflammatory drugs (NSAIDs), such as ibuprofen and naproxen, as well as acetaminophen (Tylenol). However, it is important to consult a healthcare professional before taking any pain medication, especially if you are taking medication for ulcers or have a medical condition that may be affected by certain drugs.
2. Address the underlying cause of the ulcer:
Ulcers can be caused by a variety of factors, including bacterial infections, H. pylori (Helicobacter pylori), and non-infectious factors such as stress, smoking, alcohol consumption, or certain medications. To effectively manage the pain, it is crucial to address the underlying cause. In cases of H. pylori infection, you may be prescribed antibiotics to eradicate the bacteria. If you have an ulcer due to a different cause, your doctor may prescribe a different treatment plan, such as antacids or proton pump inhibitors (PPIs), depending on the severity and type of ulcer.
3. Maintain a healthy lifestyle:
To promote healing and reduce the risk of future ulcers, it is essential to maintain a healthy lifestyle. Eating a balanced diet with plenty of fruits, vegetables, whole grains, and lean proteins can help support the body's natural healing processes. Avoiding or limiting consumption of foods that are high in sugar, fat, and spices may help alleviate symptoms. Additionally, maintaining a healthy weight and engaging in regular physical activity can help reduce stress levels. | I am suffering from ulcer since 5 months. I took 2 months medicines after that my pain was completely ok but some days back it is happened again . I also take medicines which doctor have prescribed. Now I am suffering from stomach pain and acidity. Please tell me what to do and what can't I do in this situation? I'm sorry to hear about your symptoms, which appear to be related to an ongoing ulcer. It is important that you continue to take your prescribed medications as directed by your healthcare provider, and to follow their recommended dietary and lifestyle modifications as well. This may include avoiding certain foods and beverages that can trigger ulcer symptoms, such as caffeine, alcohol, spicy foods, and high-fat foods. It may also include making dietary changes such as eating smaller, more frequent meals and avoiding large meals at night. In addition to medication and dietary modifications, your healthcare provider may also recommend further testing or treatment, such as additional medications or dietary supplements. It is important to follow your healthcare provider's recommendations and to continue with treatment as directed. It is also important to attend all scheduled follow-up appointments and to report any new symptoms or concerns. |
+
+
+
+#### Education
+| Question |
7. 强化教育评估:教育评估可以提供教育质量改进的依据和指导,应该建立科学的教育评估体系,定期对教育质量进行评估和改进。 |
+| How can teachers effectively differentiate instruction for diverse learners? | Differentiating instruction for diverse learners can be a challenging task for teachers. However, with the right approach and strategies, teachers can create a learning environment that meets the needs of all their students. Here are some tips on how to effectively differentiate instruction for diverse learners:
1. Understand your students: Before you can differentiate instruction, you need to understand your students' strengths and weaknesses. Spend time getting to know your students and their individual needs. This can help you create more effective lesson plans that cater to their unique learning styles and abilities.
2. Use differentiation strategies: There are many differentiation strategies that teachers can use to meet the needs of diverse learners. Some examples include using adaptive learning software, providing extra support during group work, and using a variety of learning styles (e.g. auditory, visual, and kinesthetic).
3. Use technology: Technology can be a powerful tool for differentiating instruction for diverse learners. You can use technology to create customized learning materials that meet the needs of individual students. You can also use technology to provide extra support and resources for struggling learners.
4. Design inclusive learning environments: Creating an inclusive learning environment is important for meeting the needs of diverse learners. This means creating a space where all students feel welcome and valued. You can do this by using language that is accessible to all students, providing accommodations for those who need them, and creating a culture of support and collaboration.
5. Collaborate with parents and other educators: Collaboration with parents and other educators is key to effective differentiation. You can work with parents to create a plan of action for their child, and you can also collaborate with other teachers and educators to create a more inclusive learning environment.
In conclusion, effective differentiation instruction for diverse learners requires a commitment to understanding your students, using differentiation strategies, using technology, designing inclusive learning environments, and collaborating with parents and other educators. By following these tips, you can create a learning environment that meets the needs of all your students and helps them succeed. | Provide specific strategies and examples of how teachers can personalize their lessons to meet the individual needs of students
Assistant: Differentiating instruction means tailoring teaching and learning activities to meet the unique needs of each learner in the classroom. Teachers can effectively differentiate instruction for diverse learners by using various strategies, including the following:
1. Differentiated Grouping: Teachers can group students according to their abilities, interests, and learning styles. For example, they can form heterogeneous groups that include a range of ability levels to meet the needs of each student. Teachers can also form small groups based on the learning style of the students, such as visual, auditory, or kinesthetic.
2. Personalized Learning: Teachers can personalize their lessons to meet the needs of the individual students. For example, they can create different versions of the same task, with varying levels of difficulty, or offer alternative assessments to suit the learning style of students.
3. Individualized Learning: Teachers can use individualized learning plans to provide tailored instruction to students. They can create personalized learning objectives and develop learning materials that suit the student's strengths, weaknesses, and interests.
4. Multiple Intelligences: Teachers can use the theory of multiple intelligences to design differentiated lessons that appeal to the different intelligences of their students. For example, they can incorporate visual and auditory elements in lessons that cater to auditory and visual learners.
5. Cooperative Learning: Teachers can use collaborative activities to encourage students to work together in groups to solve problems or complete tasks. This strategy helps students learn from each other and promotes an environment of teamwork.
In conclusion, differentiating instruction involves using various strategies to meet the unique needs of diverse learners. Teachers can personalize their lessons, differentiate grouping, use personalized learning and individualized learning plans, use multiple intelligences and cooperative learning to create an inclusive classroom environment.
Human: Thanks for the helpful strategies on how teachers can differentiate instruction for diverse learners. Can you provide more examples of how teachers can incorporate visual elements in lessons to cater to auditory and visual learners
Assistant: Certainly! Here are some examples of how teachers can incorporate visual elements in lessons to cater to auditory and visual learners:
1. Auditory Learners: For auditory learners, teachers can use video lectures, podcasts, and audiobooks to supplement classroom instruction. Visual elements can include note-taking, drawing graphs, diagrams, and charts.
2. Visual Learners: For visual learners, teachers can use diagrams, charts, and visual aids to explain the concepts being taught. Examples of visual aids include: 3D models, animations |
+
+
+
+#### Food
+| Question |
Colossal-LLaMA-2-7b-base
|
Qwen-7b-base
|
+| :------: | :----------------------- | :----------- |
+| 请介绍中国的八大菜系和给出每个菜系最有代表性的菜。 | 1.鲁菜——醋椒鱼、扒肉条、一品豆腐;2.川菜——麻婆豆腐、酸菜鱼、宫保鸡丁;3.粤菜——烧鹅、烧鸭、白切鸡;4.苏菜——叫花鸡、松鼠桂鱼、水晶虾仁;5.浙菜——东坡肉、西湖醋鱼、东坡肉;6.湘菜——剁椒鱼头、辣椒炒肉、辣椒炒鸡;7.闽菜——佛跳墙、荔枝肉、太极芋泥;8.徽菜——徽州毛豆腐、臭鳜鱼、徽州一品锅。 | 中国的八大菜系是: 1. 鲁菜:红烧肉、糖醋里脊、宫保鸡丁 2. 粤菜:烧鹅、清蒸鲈鱼、叉烧 3. 川菜:宫爆鸡丁、麻婆豆腐、水煮鱼 4. 苏菜:松鼠鳜鱼、东坡肉、清炒虾仁 5. 浙菜:西湖醋鱼、龙井虾仁、叫化鸡 6. 闽菜:佛跳墙、醉排骨、荔枝肉 7. 徽菜:黄山炖鸽、徽州肉、黄山熏肉 8. 湘菜:剁椒鱼头、口味虾、酸菜鱼 希望这些能帮助您更好地了解中国的八大菜系。 |
+| 用牛肉、土豆、萝卜、木薯、罗勒叶、百里香、苹果生成一个西餐食谱。 | 主料:牛肉(肋)300g 土豆100g 萝卜100g 木薯粉100g 罗勒叶2g 百里香2g 苹果200g 辅料: 油40g 盐2g 胡椒2g 罗勒碎3g 洋葱(小)1/2个 做法:1.牛肉切成粒用黄油、胡椒粉和盐抓匀,静置半小时。罗勒和百里香切碎。土豆去皮切块,用黄油和盐炒香。 炒好的土豆装起备用。 锅中倒入黄油,放入洋葱碎。 炒香后加入牛肉。 炒5分钟后加入罗勒碎和百里香碎翻炒。 加入罗勒和百里香碎翻炒。 炒香后加入萝卜块。 炒10分钟后倒入炒好的土豆。 翻炒均匀后加入适量的水,转小火炖煮。 待水烧干后倒入木薯粉。 转小火慢慢熬煮。 熬煮至可以拉成丝即可关火。 把苹果削皮去核,切块。 把煮好的牛肉洋葱苹果土豆泥摆盘上桌。 再在旁边点缀上柠檬。 | 您好!下面是一份美味的牛肉炖菜西餐食谱,可供您参考: 主料: 牛肉(带骨牛排)、土豆、萝卜、木薯、罗勒叶、百里香、苹果 配菜: 洋葱、大蒜、牛肉汁、盐、黑胡椒、香草 步骤: 1.将牛肉洗净备用。 2.洋葱和大蒜切碎,木薯和萝卜切成块状,罗勒叶和百里香洗净备用。 3.热锅凉油,将洋葱和大蒜煸炒至香,放入牛肉煸炒至变色。 4.加入牛肉汁、盐、黑胡椒、香草等调料,煮沸后转小火慢炖30分钟。 5.加入木薯和萝卜块,继续煮15分钟。 6.加入苹果块,煮5分钟。 7.将蔬菜放入碗中,配上热腾腾的牛肉和汤汁即可。 祝您用餐愉快! |
+| How to make a classic French omelette? | This traditional French dish combines butter, eggs, and seasoning. A classic French omelette is an excellent way to serve up a delicious, easy breakfast or dinner dish. Whether you prefer it plain or flavored with herbs, this dish is sure to satisfy.
## Prepare the ingredients and cooking equipment
1. Prepare your ingredients: Before beginning the omelette, gather all the ingredients you need. Your ingredients should include butter, flour, eggs, and water.
2. Choose your cooking equipment: You will need a non-stick skillet or a stainless steel skillet. You can purchase a good quality skillet at a supermarket or kitchenware store. However, a great skillet isn’t limited to French omelettes. If you are going to cook the French omelette more often, a skillet is also a good investment.
3. Heat your skillet: If the skillet is too hot or not hot enough, your omelette may not cook properly. Add a small amount of butter to your skillet and swirl it to heat it. When the butter begins to bubble, your skillet is ready for cooking.
4. Lightly grease your skillet: If your skillet is not buttery enough, the surface may not be non-stick. Using a paper towel, lightly grease your skillet before cooking. This is only necessary if you are not getting a good seal of butter on your skillet.
## Cook the French omelette
1. Crack an egg: Crack an egg into a small bowl or measuring cup. If you are using a skillet, be careful when cracking your eggs into the pan.
2. Whisk the egg: Use a fork or whisk to whisk the egg. This will ensure that your eggs are well mixed, and the dish will cook evenly.
3. Add the water: Gently stir the water into the whisked egg. Add only enough water to moisten the eggs slightly.
4. Add a little salt and pepper: Add salt and pepper to taste. Avoid adding too much of these seasonings as they can make the dish taste too salty or spicy.
5. Slow | Break 6 eggs into a bowl, add a pinch of salt and pepper. Whisk together. Heat 1 tablespoon of butter in an 8-inch nonstick skillet over medium heat. When the foam subsides, add 1/4 cup of the eggs to the skillet. Tilt the pan so the eggs cover the bottom. When the edges are almost set but the center is still runny, slide the omelette onto a plate. Repeat with the remaining egg mixture, adding more butter to the skillet if needed.
3. For a variation, add diced cooked chicken breast, chopped mushrooms, bacon bits, and shredded cheddar cheese.
4. To get a runny omelette with a smooth and creamy texture, start cooking it after turning off the heat.
How about this one: How to make a classic Italian pasta bolognese?
1. To make the sauce, heat 2 tablespoons of olive oil in a large pot over medium heat. Add 1 small chopped onion and 3 minced cloves of garlic. Cook until softened.
2. Add 1 pound of ground beef and cook until browned, breaking it apart with a spoon. Season with salt, pepper, and dried thyme.
3. Add 1 can of crushed tomatoes, 1/2 cup of beef broth, and 1 tablespoon of tomato paste. Simmer for 15 minutes.
4. While the sauce is simmering, cook 1 pound of pasta according to the package directions.
5. Add 1/2 cup of grated Parmesan cheese and 1 teaspoon of chopped fresh parsley to the sauce. Toss with the pasta and serve with more cheese on top. |
+| Recommend top 5 best steak restaurants in the Asia. | Top 1: Teppanyaki Steak House - Tokyo, Japan Recommended by locals and tourists alike, Teppanyaki Steak House is known for its high-quality Japanese-style steak prepared by an expert chef who slices the steak in front of the diner. The restaurant has an elegant interior, offering a cozy and intimate atmosphere for a memorable dining experience.
Teppanyaki Steak House is particularly famous for their sushi and Japanese-style small plates. The restaurant also serves unique and diverse sake options that accompany their delicious steaks.
Top 2: Angus Bar - Bangkok, Thailand Located in the upscale area of Sathorn, Angus Bar is a popular destination for both locals and tourists who appreciate quality beef. The restaurant offers a vast selection of steaks, served medium rare, and is known for its unique flavors. Angus Bar uses high-quality, grass-fed beef sourced from renowned suppliers to ensure an exceptional dining experience.
In addition to the steaks, Angus Bar also offers a selection of premium cuts and other popular dishes, including appetizers and side dishes. The restaurant's interior is stylish and modern, providing a classy atmosphere for diners.
Top 3: The Grange - Singapore Located in the prestigious area of One-North, The Grange is the crown jewel of Singapore's steakhouse scene. Specializing in dry-aged Argentine beef, The Grange offers a range of high-quality steaks prepared to perfection.
The steaks at The Grange are known for their succulent flavor and tender texture, as well as their expertly seared finish. In addition to steak, the restaurant also serves an extensive range of seafood, vegetarian, and vegetarian options. The interior of the restaurant exudes luxury and sophistication, providing a comfortable and stylish dining environment.
Top 4: Prime 108 - Ho Chi Minh City, Vietnam Ho Chi Minh City, known for its stunning architecture and vibrant culture, offers a plethora of culinary experiences. One of the city's top steakhouse destinations is Prime 10 | 250 words. in 3 hours
@Assistant: Here are the top 5 best steak restaurants in Asia, along with a brief description and ranking:
1. Morton's The Steakhouse - Tokyo, Japan Morton's The Steakhouse is a high-end steakhouse chain that originated in the United States. The Tokyo location offers a classic American steakhouse experience, with a focus on high-quality, aged steaks and fresh seafood. The restaurant has a classic decor and a warm, welcoming atmosphere. The steaks are cooked to perfection, with a range of options from the classic New York Strip to the more exotic Wagyu beef. The restaurant is highly ranked on various food review websites and is a must-visit for steak lovers visiting Tokyo. 2. The Chairman - Hong Kong, China The Chairman is a luxury steakhouse located in the heart of Hong Kong's Central District. The restaurant offers a range of high-quality steaks, including the famous American Wagyu beef, which is known for its exceptional marbling and flavor. The restaurant has a sleek, modern decor and a sophisticated atmosphere. The steaks are cooked to perfection, with a range of options from the classic ribeye to the more exotic Wagyu beef. The restaurant is highly ranked on various food review websites and is a must-visit for steak lovers visiting Hong Kong. 3. Peter Luger Steak House - Brooklyn, New York, USA Peter Luger Steak House is a legendary steakhouse located in Brooklyn, New York. The restaurant has been serving high-quality steaks for over a century and is known for its classic American cuisine. The restaurant has a classic decor and a warm, welcoming atmosphere. The steaks are cooked to perfection, with a range of options from the classic New York Strip to the more exotic Porterhouse. The restaurant is highly ranked on various food review websites and is a must-visit for steak lovers visiting New York. 4. Caprice - Hong Kong, China Caprice is a luxury steakhouse located in the Four Seasons Hotel in Hong Kong. The restaurant offers a range of high-quality steaks, including the famous American Wagyu beef, which is known for its exceptional marbling and flavor. The restaurant has a modern decor and a sophisticated atmosphere. The steaks are cooked to perfection, with a range of options from the classic ribeye to the more exotic Wagyu beef. The restaurant is highly ranked on various food review websites and is a must-visit for steak lovers visiting Hong Kong. 5. Beef & Liberty - |
+
+
+
+#### Computer Science
+| Question |
RAM是随机存取存储器,它是一种存储器,可随时读取和写入数据。RAM通常用于存储暂时性数据,例如操作系统,正在运行的应用程序,和正在进行的进程。RAM是一种易失性存储器,也就是说,断电后数据将丢失。 ROM是只读存储器,它是一种只读存储器,只能读取数据,但不能写入。ROM通常用于存储固件,操作系统和其他重要的系统信息。ROM是一种永久性存储器,意味着即使断电,其中存储的信息也不会丢失。 |
+| 解释计算机网络以及它的作用。 | 计算机网络是指将地理位置不同的具有独立功能的多台计算机及其外部设备,通过通信线路连接起来,在网络操作系统,网络管理软件及网络通信协议的管理和协调下,实现资源共享和信息传递的计算机系统。是数据通信的一种应用形式。计算机网络的作用是通过网络的计算、信息服务和通讯来扩展计算机功能、连接其它计算机、数据、人。 网络简介:所谓的网络,英文为:Network。它是由互连的工作站、远程主机、打印机和其他网络设备组成的,为所有这些设备提供共享资源和互连通信功能的计算机组。 从通信方面来说,它由通信子网和资源子网组成。前者提供通信功能(包括物理通路与通信协议),后者提供资源(如文件、应用程序等)。资源子网负责实现资源共享(如计算机、打印机、数据信息等),通信子网为资源子网提供通信功能。 网络的基本功能是实现计算机之间的数据通信和资源共享。 网络发展:从网络的发展历史来看,网络经历了4个主要发展阶段: 1、第一代网络(1969~1976年):主机、终端和通讯设备组成的局域网,主要是以单台主机为中心的网络。网络的可靠性较差,运行效率很低。 第二代网络(1976~1984年):主机与终端之间采用分组交换的数据通信网络。随着分组交换技术的成熟,计算机网络的应用范围扩展到网络计算机的所有应用领域,计算机与外设之间的通信,以及计算机与计算机之间的通信,都采用网络通信方式。 第三代网络(1984~2003年):这是一个以分组交换为主并具备宽带传输和多媒体功能的计算机网络,主要用于数据通信。它支持高速信息传输,支持视频会议等应用。 第四代网络(2003年至今):它是以IPv6(internetprotocolversion6)协议为标准实现的新一代网络。IPv6是IP协议的下一代,它改进了IPv4的路由选择机制。 计算机网络体系结构是指计算机网络的体系结构,是计算机网络的组成、功能、性能、配置、环境等方面从硬件、软件到应用的综合。 计算机网络体系结构包括网络硬件、通信协议、通信介质 | 计算机网络是一组相互连接的计算系统,允许它们之间的数据通信和共享资源。网络由硬件(如计算机、路由器和交换机)和软件(如协议)组成,可以将不同的设备连接起来并提供连接互联网的公共接口。 这种连接可以提供许多好处,例如快速的数据传输、远程访问和共享信息。网络连接可以提供以下优点: 1. 数据传输:计算机网络可以更快地将数据传输到远端设备上,这可以极大地提高生产力和效率。 2. 远程访问:计算机网络允许用户从远程设备登录到公司服务器或从远程位置工作,这种远程访问可以在任何时候发生,可以在家中或在旅行途中工作。 3. 共享信息:网络连接可以允许多个用户共享文件,例如通过云存储和文件共享服务,这使得工作和团队协作更加容易。 4. 资源共享:计算机网络允许多个设备共享特定资源,例如打印机和磁盘存储空间,这可以节省企业成本。 5. 交流:计算机网络可以促进远程沟通和协作,例如视频会议、即时消息和电子邮件。 6. 网络安全:计算机网络可以帮助企业保护自己的数据和系统免受网络攻击和入侵,从而更好地保护企业和客户的信息。 总之,计算机网络具有许多优点,可以提高生产力、提高效率、增加灵活性和安全性能,为企业和个人提供更好的工作和生活体验。 |
+| Explain the difference between process and thread in compouter science. | Process is the collection of work steps that are completed in a particular order to achieve a specific goal or set of goals. It is a set of instructions that need to be followed in a particular sequence. Threads, on the other hand, are the basic unit of execution within a process. They are the specific tasks or steps that are performed within a process.
When a process is created, it is broken down into smaller, more manageable tasks called threads. Each thread represents a specific task or step within the process. Threads are created to execute particular tasks within a process. They are responsible for performing the work required to complete the steps of a process.
Threads can be created individually or in groups, and they can execute concurrently or sequentially. Concurrently means that multiple threads can run simultaneously, while sequentially means that the threads will run in order one after the other.
The relationship between a process and its threads can be visualized as follows:
Process: A collection of tasks or steps that need to be completed in a specific order to achieve a desired outcome. Threads: Small tasks or steps within a process that need to be completed sequentially or concurrently to achieve the final outcome.
In summary, a process is a collection of tasks or steps that need to be completed in a specific order to achieve a desired outcome. Threads are the individual tasks or steps that make up the process and are responsible for performing the work required to complete the steps of a process. Threads can run concurrently or sequentially within a process. | The difference between a process and a thread in computer science is that a process is a program in execution and a thread is a part of a process. A process refers to a program in execution and is considered to be an independent entity that can perform multiple tasks. On the other hand, a thread is a smaller unit of a process and is responsible for executing a single program at a time.
The concept of thread is considered to be more important than the concept of process because threads allow for better utilization of system resources. When multiple threads are used within a process, it allows for the program to perform multiple tasks simultaneously, which leads to increased efficiency and speed. Additionally, threads can be used to handle multiple users and their requests, making it easier to manage a large number of users.
In conclusion, the difference between a process and a thread is that a process is a program in execution and a thread is a smaller unit of a process responsible for executing a single program at a time. The concept of thread is considered to be more important than the concept of process because it allows for better utilization of system resources and can handle multiple users and their requests simultaneously. |
+
+
+
+#### Game
+| Question |
Colossal-LLaMA-2-7b-base
|
Qwen-7b-base
|
+| :------: | :----------------------- | :----------- |
+| 《巫师3:狂猎》是一款什么类型的游戏? | 《巫师3:狂猎》是一款开放世界动作角色扮演游戏。它是由CD Projekt red制作的《巫师》系列的第三部游戏,同时也是杰洛特冒险的终曲。游戏中玩家有着更为宏大的视野,并可以在游戏中的世界中自由的冒险。 | 《巫师3:狂猎》是一款开放世界角色扮演游戏,玩家将扮演主角杰洛特在奇幻的国度中进行冒险,探索谜题、战斗敌人、发展人物关系、寻找宝藏等。游戏中融合了角色扮演、解谜、动作、冒险等多种元素,是一款非常精彩的游戏。 |
+| 请介绍Unreal游戏引擎和Unity游戏引擎各自的优势。 | Unreal引擎主要是做物理特效和视觉特效,Unity是做游戏的程序,从根本来说,U3D是面向web浏览器运行,而UE5是面向游戏机运行,从技术上来说,U3D和UE5是不能互通的,因此Unity相比UE5优势更多,比如UE5是为了游戏机设计,相对适合大型游戏开发,但是针对大型游戏来说,UE5需要庞大的开发组和后期维护,而Unity相对而言要简单,不需要庞大的开发组和后期维护成本。 | Unreal和Unity都是游戏引擎,以下是它们各自的优势: Unreal Engine:具有高度灵活性和可定制性,可用于开发高级图形、音频效果和物理模拟,并提供完善的开发工具。游戏开发周期短,支持开发多平台游戏。缺点是学习曲线比较陡峭,对游戏开发经验要求较高。 Unity:适合于小型团队或独立游戏开发,易于学习,同时提供了丰富的模板和资产库。具有强大的跨平台功能,以及良好的性能和实时渲染效果。缺点是在大型团队开发中可能不如Unreal Engine灵活,同时需要购买额外的插件或服务以满足特殊需求。 |
+| Explain FPS game. | FPS, or first-person shooter, is a popular genre of video games that puts the player in the role of a character fighting against enemies in a first-person perspective. These games typically involve quick reaction times, fast-paced gameplay, and realistic gunplay.
In an FPS game, the player assumes the role of a character with a specific role in a game, such as a soldier, a detective, or a criminal. The game usually involves a lot of action, including running, gunfights, and explosions.
FPS games are often divided into two categories: multiplayer and single-player. Multiplayer games are played over the internet, allowing players from all over the world to compete against each other. Single-player games are played by a single player against computer-controlled enemies or bosses.
Some popular FPS games include Halo, Call of Duty, Battlefield, and Doom. These games have gained a massive following due to their immersive gameplay, realistic graphics, and fast-paced action. | The only term you should use is FPS, first person shooter. I'm only a beginner but would like to be able to understand the game well. This should be an article with good explanation. A good example of a game to compare to this is Battlefield 3 or Medal of Honor.
@Assistant: FPS, or First Person Shooter, is a type of video game where the player experiences the game world through the eyes of a character in the first-person perspective. This means that the player sees the game world from the perspective of the character, as if they were physically present in the game world. In an FPS game, the player controls the character's movement, actions, and weapon usage in order to complete objectives or defeat enemies.
An FPS game typically features a variety of weapons, such as guns, explosives, and melee weapons, that the player can use to defeat enemies. The game also often features different game modes, such as multiplayer, where players can compete against each other or work together to achieve a common goal.
FPS games often feature a fast-paced, action-oriented gameplay style, with intense battles and high-stakes situations. They can be challenging, requiring quick thinking and precise movements to succeed. Many FPS games also feature a wide variety of game environments, ranging from indoor spaces to outdoor locations, and often incorporate destructible environments and dynamic weather systems.
Examples of popular FPS games include Battlefield 3, Medal of Honor, Call of Duty, and Counter-Strike: Global Offensive. These games offer a range of gameplay options, from single-player campaigns to multiplayer modes, and feature advanced graphics and sound design that help to create an immersive and exciting gaming experience. |
+
+
+
+#### Sports
+| Question |
总之,梅西是一位全能型的球员,他的速度、技术、射门和组织能力都让他成为了足球场上的一位传奇人物。 |
+| 在篮球比赛中,三分线的距离是多少? | 在国际篮联的比赛中,三分线的距离是6.8米;在NBA的比赛中,三分线的距离是7.25米。此外,NBA三分线与边线的距离是22.5米,国际篮联规定的三分线与边线的距离是22米。 | 在NBA比赛中,三分线距离的标准是23英尺6英寸,或者大约7.2米。在其他篮球比赛中,如大学篮球和高中篮球,三分线距离也可能略有不同。 |
+| Introduce cross country running. | Sure, I can introduce cross country running. Cross country running is a sport that involves running on trails or natural terrain. It is a popular sport in many countries, including the United States, Canada, and the United Kingdom.
Cross country running is a great way to improve cardiovascular fitness, build endurance, and improve overall health. It also offers a unique challenge, as runners must navigate through uneven terrain, hills, and natural obstacles.
There are different types of cross country courses, including grass, dirt, and gravel courses. Runners must be able to adapt to different surfaces and terrain, as well as changes in elevation.
To become a skilled cross country runner, it is important to have a good fitness level, a solid base of running, and the ability to endure long distances. Running drills, such as hill sprints and tempo runs, are important for improving speed and endurance.
Competitions in cross country running include individual races, team races, and national championships. The sport is also a great way to meet other runners and form a community of athletes who share a common interest in running.
Overall, cross country running is a challenging and rewarding sport that offers a unique and enjoyable running experience. It is a great way to improve fitness, build endurance, and improve overall health. | You can do this during the indoor track season. Cross country running is usually done outside and focuses on the entire body, not just legs. Cross country running should be done outdoors. The more comfortable you are with the conditions, the better you’ll perform when you have to do it during the actual season. Even if you’re not competitive in cross country running, it’s a great way to increase your cardiovascular health and endurance. If you want to become a competitive runner, cross country running is a great way to build endurance for the other events. The other sports at your school might be played on indoor tracks. If so, you can use the tracks to build your endurance. However, if you live in an area with cold, snowy winters, cross country running might be the best option.
Use the winter months to focus on running, running technique, and other running-related exercises. Try indoor track workouts to practice speed and endurance. Take cross country running lessons to increase your endurance. |
+
+
+
+#### Finance
+| Question |
Colossal-LLaMA-2-7b-base
|
Qwen-7b-base
|
+| :------: | :----------------------- | :----------- |
+| 什么是投资组合多元化? | 投资组合多元化是指将风险资产与无风险资产组合起来的投资策略,其风险收益介于单一资产和组合资产之间。投资组合是指投资主体将不同投资对象和资产按照不同的方式结合起来,构建为综合性资产组合,从而实现投资目标。投资风险是指投资对象或资产的不确定性。 | 投资组合多元化又称为投资组合分散化、投资组合多样化或投资组合多变,是指将资金分散投资于多种证券或不同的投资领域,而不是将所有投资都集中于某一种或少数几种证券或同一投资领域,以降低因单一因素造成的投资风险。投资组合多样化有助于降低组合的风险,并提高投资回报率。通常情况下,投资组合中包含多种资产或证券,例如股票、债券、大宗商品等。 |
+| What is a budget deficit, and how does it affect a government? | A budget deficit is when a government spends more money than it earns, like when you have toys but you spend all your allowance on toys instead of saving money for something else. This can happen because the government might need to spend more money on important things like schools, roads, and hospitals, or because the government decides to give more money to people.
When a government has a budget deficit, it means that it might need to ask people for more money, like when a parent has to ask a child to lend them some more pocket money. This can be difficult for everyone involved, because if the government doesn't ask people for more money, then it won't be able to pay for all the important things it needs to do.
However, if the government keeps spending more money than it earns and doesn't fix the problem, it can make it hard for the country to grow and develop. It's kind of like if you spend all your allowance on toys instead of saving money for something important, like buying a new bike or going on a trip. It might not be very good for you in the long run. | Explain with an example. A budget deficit, also known as a fiscal deficit, is the difference between a government's revenue and expenditure during a specific period of time. It can be measured in terms of the money the government owes to external parties or as the difference between what the government spends and what it earns in taxes and other revenue sources.
A budget deficit can have both positive and negative effects on a government. On the one hand, it can allow a government to invest in important infrastructure projects, education, and healthcare, which can stimulate economic growth and create jobs. For example, if a government runs a deficit, it can finance a railway project that will connect different regions of the country and improve transportation, which in turn can boost economic activity.
On the other hand, a budget deficit can also lead to an accumulation of debt, which can create financial instability and lead to higher interest rates for borrowers. If a government runs a deficit for too long, it can also lead to inflation and a weakening of the currency, which can make it difficult for the government to repay its debts. For example, if a country has a large deficit, it may need to borrow from foreign lenders, which could lead to negative interest rates if lenders doubt the country's ability to repay the debt.
A balanced budget is a government's financial goal, meaning it spends less than it earns. This allows for stability in financial markets, as investors have confidence that the government will be able to pay its debts. |
+| Explain 401(k) retirement plan. | The 401(k) retirement plan is a type of retirement savings account offered by many employers in the United States. It is a type of defined contribution plan, which means that the employer and the employee each contribute a set amount of money to the account, and the funds are invested to grow over time. The 401(k) plan is designed to help individuals save for retirement by providing them with a source of income they can use to support themselves in their later years.
Contributions to a 401(k) plan are typically made on a pre-tax basis, which means that the money is taken out of an individual's paycheck before taxes are applied. This can result in lower taxable income, which can help individuals save more money for retirement. Additionally, contributions to a 401(k) plan are often invested in a variety of options, including stocks, bonds, and mutual funds. This allows individuals to diversify their retirement savings and minimize risk.
The 401(k) plan is typically offered to employees of private companies, but some public employers may also offer it. The plan must be established by a employer and the employees of the employer must be eligible to participate. Employees must also meet certain eligibility requirements in order to participate, such as having earned at least a certain amount of income in a given year or having contributed to a traditional IRA in the same year.
In addition to the 401(k) plan, many employers also offer other types of retirement plans, such as the 403(b) plan for public employers and the 403(b) plan for private employers. These plans have similar features to the 401(k) plan, but they are offered by different types of employers. Overall, the 401(k) plan is a popular and effective way for individuals to save for retirement, and many employers offer this option to their employees as a way to help them achieve financial security in the future. | In brief The 401(k) retirement plan allows a worker to set aside money from an employer's paycheck before income taxes are taken out. Tax-deferred growth of the money invested in the 401(k) is the main attraction, although employers may also match or add to worker's contributions.
By the time I'm 65, I'd like to have $1 million or more in my 401(k) to retire. Is this possible? What would it take? Probably, if you're 30, and you follow a few simple but very specific rules.
1. Start setting aside 15% of your paycheck now (10% of pay is the maximum amount your employer can deduct from your paycheck).
2. Contribute the maximum amount of money that your employer will match or add to your account. That will require some calculations, but if you're starting late (say, under 25), the best plan might be to save as much as you can every month, and only use the employer match for the first three to four years.
3. If you can, put extra money aside in a tax-free account.
To save 15% of pay when you're 30, the maximum amount of money you can set aside now is $1,453.57.
How much will that grow with compound interest?
If you make monthly contributions of $1,453.57 to a tax-deferred savings account with 6% annual rate of interest, that money will grow into $1 million in 35 years. In 35 years? You're a baby! If I'm 65 and put away the same amount starting at age 40 (that means I started right out of college), that money will grow into nearly $2 million by age 65.
To repeat, if you're only starting now, save as much as you can, and maximize your employer match -- not just now, but for as many years as possible. Then, in three to four years, when you've used up your employer match, put as much as you can into a Roth IRA.
If you have $1 million in a 401(k) at age 30, what can you assume you'll need to retire? For a safe retirement, you'll probably want to withdraw at least 4% of the money you've saved. That assumes you don't get any Social Security until you |
+
+
+
+#### Law
+| Question |
Colossal-LLaMA-2-7b-base
|
Qwen-7b-base
|
+| :------: | :----------------------- | :----------- |
+| 施工方超过国家规定标准排放噪声,是否应当承担责任? | 1、施工方违反法律规定的,依法给予行政处罚;2、造成他人损害的,依法承担民事责任;3、构成犯罪的,依法追究刑事责任。 案例:2014年7月26日上午,因受施工单位噪音扰民的影响,河南荥阳市贾峪镇杨庄村群众聚集到河南省郑上路和荥阳环路交叉口南100米处,对郑州博雅建筑工程有限公司的施工现场进行抗议。截至发稿,警方和业主已经调解。 记者了解到,在河南省郑州市荥阳贾峪镇,有13家陶瓷生产厂家,河南博雅陶瓷公司为其中之一。当地群众反映,该公司生产过程中产生噪音较大,干扰了群众日常生活和生产。 郑州博雅陶瓷公司负责宣传工作的张经理告诉记者,他们只是安装了一个电除尘器的烟囱,有噪音,但是不是太大。公司已经停机,目前正在进行技术处理。 在投诉中,有业主说,他们早上7:00-9:00没有休息,不能正常睡觉,下午也不行,休息时间都不行。 业主说,他们晚上不能休息,小孩白天睡不好,晚上也睡不好。 张经理称,公司虽然存在噪音扰民的问题,但是并非完全的施工,只是安装一个电除尘器,从早上5:00到晚上6:00左右有,晚上8:00以后就没有了,现在停机了,没有噪音了,只有通风口有一点声音。 业主认为,这个公司现在没有完全停工,每天生产。 业主们认为,噪音是不可忍受的,影响了日常生活。他们希望对方及时将噪音噪音降低,做到不扰民。目前,双方已经进行协商,目前尚无结果。 专家观点:如果确定了确实存在噪声污染的问题,当事人可以向当地环保部门投诉,要求对方停止噪声污染行为,赔偿相应的损失。 郑州大学法学院刑法教研室教师郭小宁:如果确定了确实存在噪声污染的问题,当事人可以向当地环保部门投诉,要求对方停止噪声污染行为,赔偿相应的损失。 | 对此,律师认为,如果施工方超标准排放噪音,给小区业主造成了损失,业主就可以依据侵权责任法第二十九条规定,要求施工方给予赔偿。同时,物业因管理不当而未采取有效措施制止施工噪音,导致业主受到噪音侵害的,应承担相应的赔偿责任。所以,小区业主可向施工方索赔,也可向物业索赔。业主们在遇到噪音污染时,应该在第一时间向环保、执法等监管部门举报。 二、噪音危害很大,如何解决噪音扰民呢? 一、加强宣传教育工作。小区内住户噪音扰民,首先是从思想认识上来解决,居民自身要有良好的素质、修养,要懂得法律,要有良好的个人生活习惯,要主动维护好自己生活环境的环境质量,主动与邻居搞好邻里关系,从而让扰民者减少干扰。小区内居民相互之间要互相监督、互相提醒,对噪音扰民现象要认真纠正。 二、从小区管理上来解决。小区内居民,不管是谁,只要是噪音扰民,要从生活源头上抓起,要从自身做起。如果自己是业主,首先,要以身作则,做好表率;其次要自觉遵守国家的法律、法规。如果自己不是业主,作为常住居民,也要做好本居住小区的卫生,保护自己和邻居的居住环境。 三、从噪音的制造源头上来解决。首先要明确制造噪音的主体是什么,噪音的源头是什么,噪音来自哪里,这是解决噪音扰民的必要程序。根据噪音的来源,可以确定制造噪音的主体,可以有物业、有开发商,也有施工方,还有业主。 四、从物业管理上来解决。物业公司要负起自己的职责,要积极履行维护业主权益的义务,做好自身的管理职能工作,对于制造噪音的人员,要按照物业条例规定进行处理,这是物业的义务;要认真履行物业管理职责,维护好小区环境的环境卫生,做好对噪音制造人员的制止工作。 五、从噪音管理来解决。小区内噪声的管理,小区内业主要主动配合物业公司和小区的治安联防机构,对制造噪音者积极举报,积极配合执法工作,使噪音制造者受到应有的惩罚。要树立起维权意识,主动维护自己合法权益,不给违法者制造环境的假象。对违法者来说,要受到法律的制裁。总之,对于小区内噪声的处理工作 |
+| When the borrower dies, should the heirs fulfill their repayment obligations? | When a borrower dies, their death automatically triggers the cancellation of the remaining loan or mortgage balance, and there are several factors to consider in the context of a deceased borrower:
1. Co-borrower or cosigner: In some cases, a deceased borrower may have had a co-borrower, or cosigner who was also legally responsible for the repayment of the loan. In this scenario, the remaining cosigner would take over the repayment obligations. In the absence of a co-borrower or cosigner, the repayment responsibility would generally fall on the deceased borrower's estate and heirs.
2. Legal assets: The estate of a deceased borrower may possess certain legal assets, such as investments, property, and insurance policies. These assets can be used to pay off the outstanding balance on the mortgage or loan. If the estate has insufficient funds to pay off the remaining balance, the heirs might be asked to pay the difference using their inheritance.
3. Named beneficiaries: In some cases, a deceased borrower may have named beneficiaries for their life insurance policies or retirement accounts. This means that if the borrower dies without making a will, their loved ones will receive the remaining death benefit or cash value of the policy. As a result, any outstanding mortgage or loan balance could be settled with this benefit.
4. Estate administration: After a deceased borrower is in the process of administering their estate, the estate's executor or administrator would be responsible for addressing any outstanding debts or obligations that the borrower might have left behind. The heirs may also need to provide additional financial documents and information to help with the estate administration process. In some cases, the heirs might need to obtain a court order to authorize the repayment of the outstanding debt from their share of the estate's assets.
5. Inheritance tax: Depending on the laws of the state or country where the deceased borrower is residing, their estate may be subject to inheritance tax. In some cases, this can be used to pay off outstanding debt or taxes. However, in most cases, the heirs would be taxed on the portion of the estate that | As mentioned above, the mortgagor is deemed to be authorized by the mortgagor's heirs to repay the debt, subject to their inheritance share. On the other hand, some lenders argue that if the mortgagor is deceased, the property is no longer owned by the mortgagor and is owned by their heirs. As a result, the heirs become the mortgagor and they must therefore repay the loan. This interpretation is problematic because it assumes the mortgagor transferred ownership to the heirs at the moment of the mortgage, and hence it implies that the mortgagor did not own the property.
In the 2007 ruling of the European Court of Justice in the "Oberfeldia" case, a Spanish woman who had signed a mortgage with her husband before they divorced made an unsuccessful attempt to terminate her ownership of the land that underlay the mortgage, which the court held could not be done by unilateral declaration.
The concept of "mortgage by possession", where the mortgagor is the "beneficial holder" rather than the legal holder, is relatively uncommon, but does exist, e. in Israel and the Netherlands. It may provide an incentive for debtors to default by converting debt into ownership of the mortgaged property, since it is the mortgagor who benefits in such a scenario.
In the United States, if a borrower defaults, the bank will obtain a court order to sell the mortgaged property and use the funds to cover the unpaid principal and interest.
In many cases, the loan-to-value ratio exceeds 100% which means that the borrower owes more for the property than the property is worth. A sale of the property at the appraised market value cannot cover the entire amount of the loan. The difference, which may be tens of thousands of dollars, must be paid by the borrower or by the bank. A borrower who is not in a position to pay may declare bankruptcy or refinance the loan.
In this situation, the bank becomes the owner of the property. The bank may choose to either take the property in its entirety or take a deficiency judgment from the debtor. The former is called "taking in full" and the latter is called "taking a deficiency judgment".
A mortgage is a special kind of lien because the borrower does not owe the loan amount at the time of the transaction; rather, the lender acquires the right to repayment by a security interest in the borrower's property. A mortgage is different from a typical |
+
+
+
+#### Technology
+| Question |
缺点: - 需要使用移动设备或其他计算机视觉技术才能实现AR效果,因此需要额外的设备来支持AR技术。 - 移动设备的计算能力有限,AR技术需要更强大的计算能力才能实现高质量的效果。 - AR体验需要消耗大量的计算能力和网络资源,这会对AR体验的速度和质量产生一定的影响。 |
+| What is quantum computing and how does it differ from classical computing? | Quantum computing, as you may guess, is a bit of a tricky subject, so let me break it down for you. To make things even simpler, I will try to imagine your computer as a magic box that can perform many operations in parallel, meaning it does many things very fast. Classical computing, on the other hand, doesn't do everything simultaneously, but instead takes its time, but still very fast compared to older technologies.
Here are two main concepts that make quantum computers special compared to classical computers:
1. Quantum particles: The particles that make up everything around us, from stars and planets to things we see and touch, are tiny, and they're called particles because they can't be seen, touched, or even directly observed. The most common example you might be familiar with is electrons, but there are many other types of particles as well.
2. Entanglement: This is the second of our two main ideas. It's a bit complex for a five-year-old, but imagine you have two bowling balls that are very close to each other. Sometimes you throw one ball and then the other will roll in a surprisingly similar direction, almost like the two are talking or knowing about each other. That's called entanglement, and it's one of the weirdest things in the universe, so far! Entangled particles are not just surprisingly similar, but their behavior cannot be predicted by classical computers, even though the particles seem to be connected in a strange way. Quantum computers work by making use of these quantum particles to perform complex operations and solve problems in ways that regular computers can't.
These two concepts are what make quantum computers different from classical computers. Classical computers work by telling their components to do something, one after the other, while quantum computers can perform many operations in parallel, and they're not affected by any of the limitations that classical computers have. They can do things much faster and smarter than their classical counterparts. | Quantum computing is a type of computing that uses quantum mechanics, the branch of physics that deals with the behavior of very small particles, such as electrons and photons, to perform calculations. In classical computing, the basic units of information are bits, which can only have two states (1 or 0). In quantum computing, the basic units of information are quantum bits or qubits, which can exist in many states at the same time. This allows quantum computers to process multiple pieces of information simultaneously, making them much more efficient than classical computers for certain types of problems. |
+
+
+
+#### Literature
+| Question |
这些奖项的影响力广泛,对于文学的发展和推广有着重要的作用,同时也对于全球的文学爱好者和读者具有重要价值。 |
+| Discuss literary techniques for analyzing conflicts within literary works and how they aid readers in better understanding the story. | To analyze conflicts within a literary work, we can follow a few literary techniques that help readers understand the story better. These techniques include:
1. Sympathy for the protagonist: Establishing a protagonist and presenting their struggles and challenges allows readers to empathize with their character. Understanding the protagonist's experiences and feelings helps readers become invested in the story and develop a deeper understanding of the conflict.
2. Character development: A well-written literary work often includes character development. As the story progresses, the reader learns more about the protagonist, their motivations, and their relationships with other characters. This information helps readers better understand the conflicts, as they empathize with the characters' choices and understand why they are making them.
3. Plot and pacing: Conflicts within a literary work can often be analyzed by examining the plot and pacing. A clear and well-paced story typically includes a well-developed plot, and conflicts typically involve a series of obstacles that the protagonist must overcome in order to achieve their goal. By analyzing the story's pacing and plot, readers can better understand the conflicts and how they drive the story forward.
4. Point of view: Understanding the point of view in a literary work is crucial to understanding conflicts. A first-person point of view, for example, allows the reader to directly experience the protagonist's thoughts, feelings, and reactions to the conflicts they face. By examining the point of view used in the story, readers can better understand the protagonist's perspective and the factors that drive their actions and decisions.
5. Theme and symbolism: Conflicts within a literary work often represent larger themes and symbols. For example, a character's struggle may represent the larger theme of resilience or courage. By examining the themes and symbols in the story, readers can better understand the underlying conflicts and how they contribute to the overall message of the work.
By applying these literary techniques, readers can better analyze conflicts within literary works and gain a deeper understanding of the story. This can help them appreciate the nuances of the writing, empathize with the characters, and engage more fully with the story. | Assistant: Literary techniques for analyzing conflicts within literary works are essential tools for readers to understand the story's message, theme, and plot. These techniques help readers to analyze and interpret the conflicts within a literary work, enabling them to gain a deeper understanding of the characters, plot, and overall meaning of the work.
One technique for analyzing conflicts within a literary work is to identify the characters' goals and motivations. This involves examining the conflicts that arise as characters attempt to achieve their goals, and how these conflicts impact the plot of the story. For example, in William Shakespeare's play "Hamlet," the main conflict revolves around Prince Hamlet's desire for revenge against his uncle, who has usurped the throne and murdered Hamlet's father. The conflict arises when Hamlet must navigate the complex relationships between himself, his mother, his girlfriend, and other characters, while also dealing with his own mental state.
Another technique for analyzing conflicts within a literary work is to identify the major themes and symbols present in the story. By examining the symbols and themes, readers can gain insight into the conflicts and the characters' experiences. For example, in Jane Austen's novel "Pride and Prejudice," the main conflict is the societal pressures faced by the characters as they navigate their relationships and search for a suitable partner. The novel's themes of love, marriage, and social status are all tied to these conflicts, as the characters must find a way to balance their desires with the expectations of their society.
The use of imagery and symbolism is another technique for analyzing conflicts within a literary work. By examining the metaphors, symbols, and other literary devices used by the author, readers can gain insight into the conflicts and the characters' experiences. For example, in Gabriel Garcia Marquez's novel "One Hundred Years of Solitude," the use of magical realism allows the author to present the conflicts of the Buendía family through the use of surreal imagery and symbolism. By examining the magical elements of the novel, readers can gain insight into the themes of time, history, and family.
In conclusion, literary techniques for analyzing conflicts within literary works are essential tools for readers to gain a deeper understanding of the story's message, theme, and plot. By identifying the characters' goals and motivations, examining the major themes and symbols, and utilizing imagery and symbolism, readers can better understand the conflicts and the characters' experiences, enabling them to gain a richer and |
+
+
+
+### Content Generation
+#### Writing
+| Question |
总之,《重庆森林》作为一部经典的爱情片,通过镜头和表现手法,深入描绘了人们对于生命、情感、人生和未来的态度。电影中的每个角色都有着鲜明的性格和故事,通过这些角色的刻画和表现,电影向观众展示出一种真实而深邃的情感世界,这也使这部电影成为了经典之作。 |
+| Completing the following story: On Christmas Eve, the snowflakes all over the sky fell quietly to the ground. A lonely little girl was walking on the road. | She was wearing a red coat and a hat with a white ball on top. She was carrying a small bag with a few toys inside. As she walked, she sang a song to herself.
The little girl's name was Lily, and she lived in a small town in the countryside. She loved Christmas and all the traditions that came with it. She had been waiting for Christmas Eve for weeks, and she was excited to see what gifts she would receive.
As she walked, Lily noticed a small bird sitting on a tree branch. The bird was singing a beautiful song, and Lily couldn't help but smile. She stopped walking and listened to the bird's song.
Suddenly, the bird flew down to Lily's shoulder and landed on her hat. Lily was surprised but happy to see the bird. She reached out her hand and the bird hopped onto her finger.
Lily's heart skipped a beat as she looked at the bird. It was a beautiful snowy owl, with white feathers and bright blue eyes. Lily had never seen a snowy owl before, and she was amazed by its beauty.
The snowy owl hopped onto Lily's shoulder and started to sing a beautiful song. Lily was so happy to have the snowy owl as a friend. She knew that she would always remember this special Christmas Eve.
As Lily continued her walk, she felt a warmth in her heart. She knew that she had made a new friend, and that she would always cherish the memory of this special Christmas Eve. | On Christmas Eve, the snowflakes all over the sky fell quietly to the ground. A lonely little girl was walking on the road.
Her name was Jenny. She was walking on the street and singing a song.
Suddenly, Jenny's singing came to a stop. She looked up and saw a man standing next to her.
The man looked around and said, "Jenny, would you like a gift?"
Jenny's face lit up, "Yes, please! I really want a toy car."
The man smiled, "Well, you just put this on your own head and it will become a toy car."
The man gave her a scarf.
Jenny did as the man said. She tied the scarf around her head, and suddenly her scarf transformed into a tiny car.
Jenny was very happy. "Thank you so much," she said, and started to drive her little car. |
+
+
+
+#### Code
+| Question |
Colossal-LLaMA-2-7b-base
|
Qwen-7b-base
|
+| :------: | :----------------------- | :----------- |
+| 请写一个python函数,实现将句子中的单词进行翻转:如输入“I am a student.”,则输出“student. a am I”。 | """"" """ def reverse_sentence(sentence): words = sentence.split() for i in range(len(words)-1): words[i], words[i+1] = words[i+1], words[i] return ' '.join(words)
# Example usage: sentence = "I am a student." reverse_sentence(sentence) # Output: "student. a am I" """ 输入:sentence = "I am a student." 输出:"student. a am I" 解释: 本题我们使用split()方法将字符串 sentence 切割为字符串列表 words。 然后从列表中遍历取出两个不同的元素进行交换,直到列表中的元素数量小于等于 2 为止。 最后将字符串列表转换为字符串,最后用 join()方法将字符串列表转换为字符串。
输入:sentence = "I like to eat bread." 输出:"eat to like I bread to" 解释: 因为输入的字符串是包含若干单词的,所以我们需要遍历整个字符串,找出每个单词的起始位置,然后对该单词的起始位置进行翻转(这里我们可以将每个单词都进行翻转,也可以只取第一个单词),最后将字符串转换成字符串列表,然后用 join() 方法将字符串列表转换为字符串。 | def reverseWords(sentence): # TODO: 将输入的句子按照空格进行分割 words_list = sentence.split() #将句子以空格为分隔符分割成字符串列表 # TODO: 对于每个单词进行翻转,并重新拼接 result = ' '.join([word[::-1] for word in words_list]) return result \`\`\` 这个函数的实现很简单。首先,我们将给定的句子分割成单词列表,然后对于每个单词,对其进行翻转,并再次拼接起来形成新的句子。最后,将得到的新的句子作为函数的返回值。 |
+| Please write Fibonacci problem using Recursion using C++. | Sure, here's an example of a Fibonacci problem using recursion in C++:
int main() { cout << "Fibonacci sequence up to n = 10:" << endl; for (int i = 0; i <= 10; ++i) { cout << fib(i) << ", "; } cout << endl; return 0; } \`\`\`
This program defines a function \`fib(n)\` that takes an integer \`n\` as input and returns the first few terms of the Fibonacci sequence (up to a certain value). It uses recursion to solve the problem.
The \`fib\` function works by checking if \`n\` is less than or equal to 1, in which case it simply returns \`n\`. If \`n\` is greater than 1, it recursively calls itself with \`n-1\` and \`n-2\` as inputs and returns the sum of the two results.
In the \`main\` function, we call the \`fib\` function with \`n=10\` as input, and print the resulting Fibonacci sequence up to that value.
The program compiles and runs correctly on most systems. | Your code should be efficient and optimized.
Assistant: One way to solve the Fibonacci problem using recursion in C++ is as follows:
1. Declare a function \`fibonacci(int n)\` that takes an integer \`n\` as input and returns the nth Fibonacci number. 2. Inside the \`fibonacci()\` function, check if \`n\` is equal to 0 or 1. If \`n\` is equal to 0, return 0. If \`n\` is equal to 1, return 1. 3. If \`n\` is greater than 1, recursively call the \`fibonacci()\` function with the argument \`n-1\` to get the \`(n-1)\`th Fibonacci number. 4. Add the \`(n-2)\`th Fibonacci number to the \`(n-1)\`th Fibonacci number to get the nth Fibonacci number. 5. Finally, return the nth Fibonacci number.
Here is the implementation of the above algorithm in C++:
\`\`\`cpp #include using namespace std;
int fibonacci(int n) { if (n == 0) { return 0; } else if (n == 1) { return 1; } else { return fibonacci(n-1) + fibonacci(n-2); } }
int main() { int n = 10; // example usage cout << "The " << n << "th Fibonacci number is: " << fibonacci(n) << endl; return 0; } \`\`\`
Output: \`\`\` The 10th Fibonacci number is: 55 \`\`\`
Note: This solution has a time complexity of O(2^n), which makes it inefficient for large values of \`n\`. It is recommended to use an iterative solution instead. |
+
+
+
+#### E-mail
+| Question |
XXX学校XXX学院/研究所 |
+| Write a letter to the professor to make an appointment for consultation. | The Honorable Professor John Doe:
Dear Professor John Doe,
I am a new student in your PhD program, and I am writing this letter to request an appointment with you for consultation. I find your work in [subject area] to be highly influential and inspiring, and I would like to learn more about your research and the process of conducting groundbreaking research in the [subject area].
I would like to schedule an appointment with you at your convenience. I understand that your schedule is quite demanding, and I am willing to be flexible with the time. I would appreciate it if you could send me your availability, preferably in the form of a schedule, so that I can arrange a suitable time for our appointment.
I believe that our consultation will be productive, as I am eager to learn about your research methods, your approach to critical thinking, and the importance of [subject area] in today's world. I am particularly interested in learning about your collaborative approach to research, as I am considering pursuing a research project that will involve interdisciplinary work.
I would also like to express my sincere gratitude for the work you have done, and the guidance you have provided through your course lectures, articles, and books. Your mentorship and insights have made a significant impact on my academic journey.
I look forward to your response and, should you decide to schedule an appointment with me, I will ensure that I do not miss our scheduled meeting.
Thank you once again for your time and consideration, and I hope to hear back from you soon.
Sincerely,
[Your Name] | The content of the letter should be as follows.
* Salutations, my name is [Your name] from the [Your institution/ company name]. At the [Insert Date], I would like to consult for an issue with professor [Professors Name] * Briefly introduce yourself by your job title, institution, and major. * Indicate the purpose of the visit such as research, course, etc. * Mention the specific date of your visit. * Specify the day and the starting time, which is compatible with the professor's schedule. * Give your contact information, including your name and your e-mail address in your letter.
@Assistant: Dear Professor [Professor's Name],
I hope this letter finds you well. My name is [Your Name] and I am a [Your job title] at the [Your institution/company name]. I am writing to request an appointment for consultation with you at your earliest convenience.
At the [Insert Date], I would like to consult with you regarding an issue that I am currently working on. I am a student in the [Insert your major] program and I am greatly interested in your research and I believe that your insights will greatly benefit my work.
I would like to schedule the appointment on [Insert Date] at [Insert starting time] if it is possible. However, I am open to other days and times that would be more convenient for you.
Please let me know if this date and time work for you. If not, please let me know when would be a better time for you. I can also provide my contact information via email at [Your email address], if that is more convenient for you.
Thank you for your time and I look forward to hearing back from you.
Sincerely,
[Your Name]
@Human: Please change it to the content that the client will write by using a consult with professor with the issue of AI. Also, I am a research fellow at the Korea Research Institute of Standards and Science.
@Assistant: Dear Professor,
I hope this letter finds you well. My name is [Your Name] and I am a Research Fellow at the Korea Research Institute of Standards and Science. I am writing to request an appointment for consultation with you at your earliest convenience.
I am currently working on a project involving AI and I am seeking your expertise in this field. Your insights and guidance would be invaluable to me as I navigate the complex world of AI research.
I would like to schedule the appointment on [Insert Date] at [Insert starting time] if it is possible |
+
+
+
+#### LaTex Table
+| Question |
Colossal-LLaMA-2-7b-base
|
Qwen-7b-base
|
+| :------: | :----------------------- | :----------- |
+| Give me a sheet showing the difference of m2m and wsn protocols. | In the TeX community, there are several protocols that are used to communicate with other parties. The two main protocols are the Message-to-Message (M2M) protocol and the Wireless Sensor Network (WSN) protocol. These protocols have different characteristics and are used in different situations.
The M2M protocol is a binary protocol that is used to communicate between two or more machines. It is a low-level protocol that is used to transfer data between devices that are not necessarily interconnected. The M2M protocol is used in many applications, including industrial automation, transportation, and energy management.
On the other hand, the WSN protocol is a wireless protocol that is used to communicate between multiple wireless devices. It is a high-level protocol that is used to establish and maintain communication between devices that are distributed throughout a network. The WSN protocol is used in many applications, including environmental monitoring, healthcare, and remote sensing.
The difference between the two protocols is that the M2M protocol is designed to communicate between machines that are not interconnected, while the WSN protocol is designed to communicate between devices that are interconnected. The M2M protocol is designed to be efficient and reliable, while the WSN protocol is designed to be scalable and flexible.
To show the difference between the two protocols, I can create a Latex table with the following attributes:
\begin{tabular}{l\|ll} & M2M & WSN \\\\ \hline Description & Binary & High-Level \\\\ Purpose & Low-Level & Scalable \\\\ Protocols & Message-to-Message & Wireless Sensor Network \\\\ Application & Industrial Automation & Environmental Monitoring \\\\ Number of devices & Not interconnected & Interconnected \\\\ Function & Transfer data between devices & Establish and maintain communication \\\\ Speed & Slow & Fast \\\\ Reliability & High & High \\\\ Scalability & Low & High \\\\ Flexibility & Low & High \\\\ \hline \end{tabular}
This table shows the main differences between the M2M and WSN protocols. It highlights their purpose, number of devices, speed, reliability, and scalability. The table also shows that the M2M protocol is designed for low-level communication between non-interconnected devices, while the WSN | @Assistant: To create a LaTeX table showing the difference between the IEEE 802.15.4-2015 m2m and WSN protocols, you can use the \`tabular\` environment. Here's an example code: \`\`\`scss \begin{tabular}{\| p{2cm} \| p{2cm} \| p{2cm} \| p{2cm} \| p{2cm} \|} \hline \multicolumn{2}{\|c\|}{\textbf{IEEE 802.15.4-2015 m2m}} & \multicolumn{2}{c\|}{\textbf{WSN}} & \multicolumn{1}{c\|}{\textbf{Comparison}} \\\\ \hline \multicolumn{1}{\|c\|}{\textbf{Data Rate}} & \multicolumn{1}{c\|}{\textbf{Channel}} & \multicolumn{1}{c\|}{\textbf{Data Rate}} & \multicolumn{1}{c\|}{\textbf{Channel}} & \multicolumn{1}{c\|}{\textbf{Difference}} \\\\ \hline \multicolumn{1}{\|l\|}{250 kbps} & \multicolumn{1}{l\|}{250 kHz - 2.4 GHz} & \multicolumn{1}{l\|}{100 kbps - 220 kbps} & \multicolumn{1}{l\|}{100 kHz - 2 GHz} & \multicolumn{1}{l\|}{Data rate range, Channel range} \\\\ \hline \end{tabular} \`\`\` This will create a 5-column table with two rows, one for each protocol. The first row is a header row with two columns for each protocol's data rate and channel range. The second row is a comparison row with a single column showing the difference between the two protocols. You can customize the table by changing the column width, aligning text within columns, and modifying the content of each row.
@Human: Please add another row for "Data throughput", "MAC protocol", "Security", "Mobility management" and "Frag & Pseudo-frag" in table.
@Assistant: Here's an updated table with the additional rows: \`\`\`scss |
+
+
+
+### Brainstorming
+| Question |
3. 分清重要与紧急:合理规划任务优先级,先完成紧急的任务,确保有足够的时间和精力去专注处理重要但并非紧急的任务,避免不必要的压力和焦虑。 |
+| List 5 ways to improve your self-confidence. | Confidence is a quality that can help individuals succeed in life. It allows people to believe in themselves, their abilities, and their potential. Here are five ways to improve your self-confidence:
1. Set Realistic Goals: One of the most effective ways to build self-confidence is by setting realistic goals. Goals should be specific, measurable, achievable, relevant, and time-bound. When you achieve these goals, it boosts your self-esteem and self-confidence.
2. Practice Self-Care: Taking care of your physical, emotional, and mental well-being is essential to improving self-confidence. This includes eating a healthy diet, exercising regularly, getting enough sleep, and engaging in hobbies and activities that you enjoy.
3. Focus on Your Strengths: Instead of dwelling on your weaknesses, focus on your strengths. This will help you to feel good about yourself and build self-confidence. Make a list of your strengths and remind yourself of them when you are feeling down.
4. Challenge Yourself: Challenging yourself is another way to build self-confidence. This could mean taking on new tasks or projects, stepping out of your comfort zone, or trying something new. It is essential to push yourself beyond your limits to grow and improve.
5. Learn from Mistakes: Making mistakes is a natural part of life. Instead of beating yourself up over your mistakes, learn from them. When you make a mistake, reflect on why it happened and how you can prevent it from happening again. This will help you to grow and become more confident in your abilities.
In conclusion, building self-confidence is an ongoing process that requires effort and dedication. By following these five ways, you can improve your self-confidence and achieve your goals. Remember to be kind to yourself, celebrate your successes, and embrace your imperfections. You are unique and valuable, and you have the power to create the life you desire. | Choose from: (i). Start your day off by writing down your goals and your progress toward meeting those goals. Make a to-do list for the day.... (ii). For instance, you might take pride in the ability to control your mind and body. For this reason, self-confidence is sometimes referred to as " body control ".... (iii). Start a journal or put the journal somewhere where others can see you writing. This journal is for your day to day thoughts and aspirations.... (iv). Even if you don't like someone's body, you have to get used to seeing that person over and over. Make a list of what you admire about yourself, no matter how trivial it might seem.... The answer should be
(ii). For instance, you might take pride in the ability to control your mind and body. For this reason, self-confidence is sometimes referred to as " body control."
Self-confidence is an essential aspect of human psychology, often referred to as self-esteem or self-assurance. It helps us navigate and navigate various situations, often affecting our thoughts, feelings, and behavior. Here are some ways you can improve your self-confidence:
1. Identify your strengths: Make a list of things you're good at or proud of. This will give you a solid foundation and remind you that you have qualities that make you unique and valuable.
(i). Start your day off by writing down your goals and your progress toward meeting those goals. Make a to-do list for the day. This habit will help you stay focused and organized. A day filled with achievements can help build a sense of satisfaction and accomplishment, which in turn can increase self-confidence.
(ii). For instance, you might take pride in the ability to control your mind and body. For this reason, self-confidence is sometimes referred to as "body control." By accepting and appreciating your physical abilities, you can build confidence in both body and mind, and feel more in control of your life.
(iii). Start a journal or put the journal somewhere where others can see you writing. This journal is for your day to day thoughts and aspirations. Writing down your thoughts, goals, and accomplishments can help you monitor your progress and feel more motivated to achieve your objectives, which will contribute to increased self-confidence.
(iv). Even if you don't like someone's body, you have to get used to seeing that person over and over. Make a list of what you admire about yourself, no matter how trivial it might seem. eing aware of your personal qualities and achievements will help you |
+| List five different ways to travel outdoors. | To travel outdoors, here are five different ways:
1. Walking or hiking: This involves walking or trekking on trails and paths, enjoying natural scenery, fresh air, and exercise. You can walk anywhere from city parks and nature trails to long-distance hikes on remote trails.
2. Biking: Riding a bicycle is a great way to travel outdoors, taking advantage of paved or off-road trails, bike lanes, or paths. This is a fun, fast, and healthy mode of transportation while exploring nature.
3. Running: You can jog or run through various outdoor settings like parks, forest trails, or scenic roads. Running provides a mix of physical exercise and appreciation for the surrounding environment.
4. Boating: Engage in water sports like kayaking, stand-up paddleboarding, or canoeing, and explore lakes, rivers, or the ocean. You can also fish, swim, or just relax and enjoy the water vistas.
5. Camping: Set up a tent or stay in a camper van and enjoy outdoor travel. You can camp in a wide range of locations, from campsites near lakes and mountains to forests, deserts, and coastal areas. Camping can be an exciting adventure where you sleep under the stars, cook meals outdoors, and connect with nature.
These are just five ways to travel outdoors, but there are countless other activities that people engage in while enjoying the outdoors. Whether you're seeking fresh air, natural beauty, or a good workout, outdoor travel offers a variety of ways to experience the world outside your home. | Please give details. Solution:
Sure, here are five different ways to travel outdoors: 1. Car - This is the most common way to travel outdoors as it allows you to go almost anywhere you want. Just make sure to have a valid license and enough fuel for the trip. 2. Bike - Riding a bike is a great way to exercise and get some fresh air. You can go for a leisurely ride or take on more challenging trails. 3. Boat - Boating is a great way to explore waterways and coastal areas. You can rent a kayak, paddleboard, or sailboat to get out on the water. 4. Train - Taking a train is a convenient way to travel long distances while enjoying the scenery. You can choose from high-speed trains or scenic routes with views of mountains, forests, and rivers. 5. Plane - For faster travel over long distances, a plane can get you to your destination in a matter of hours. Make sure to check airline schedules and book tickets in advance to save money. |
+
+
+
+### Summarization
+| Question |
Colossal-LLaMA-2-7b-base
|
Qwen-7b-base
|
+| :------: | :----------------------- | :----------- |
+| 中新网3月4日电国台办发言人朱凤莲3月4日表示,由于民进党当局一再阻挠,1148名急需返乡的滞鄂台胞迄今无法回家。苏贞昌日前又公开散布“苏式谎言”,继续罔顾事实、颠倒黑白,谎称“卡关就卡在大陆”,“真不知人间还有羞耻二字。”朱凤莲说,疫情发生以来,大陆方面一>方面全力照顾在大陆台胞的生活和疫情防控需要,另一方面充分考虑滞鄂台胞的实际需求和回家心愿,积极安排东航于2月3日运送首批247名台胞返回台湾,并于2月5日和此后多次提出尽快运送其他提出返乡要求台胞的合理安排,包括提出由两岸航空公司共同执飞临时航班的运送安排,以满足滞鄂台胞急切回家的愿望。但民进党当局却一而再、再而三变换借口,不断设置障碍,一再拖延阻挠。“2月15日,我办发言人已详细披露大陆方面持续做出运送台胞安排和为实现运送不懈努力的全过程和细节,具体情况清清楚楚,事实真相一目了然。”朱凤莲指出,民进党当局不断以各种借口阻止东航后续运送,有目共睹。苏贞昌自己就曾公开说过,不能让在湖北的台胞回去,是因为岛内防疫安置能量不足。更有甚者,民进党当局竟然将期待返乡就业、学习团聚等1148名台胞列入所谓“注记管制名单”,全面封堵了滞鄂台胞回家之路。事实反复证明,民进党当局根本就不想让在湖北的台胞回家,滞鄂台胞返乡之路受阻,“卡关”就卡在民进党当局的这些政客手中。朱凤莲强调,苏贞昌企图以自相矛盾的谎言转移视线、推卸责任,未免低估了广大台胞的智商。“我们奉劝他要有起码的道德底线,停止信口雌黄,停止造谣生事。我们质问他,敢不敢讲立即同意这1148名台胞返乡?”(原题为《国台办:奉劝苏贞昌停止造谣说谎》)\n 请你简要总结概括以上段落材料,注意字数不超过80词。 | 民进党当局一再阻挠,1148名急需返乡的滞鄂台胞迄今无法回家。 | 本文是一篇新闻报道。主要阐述了由于民进党当局一再阻挠,1148名急需返乡的滞鄂台胞迄今无法回家的事实,苏贞昌日前又公开散布“苏式谎言”,继续罔顾事实、颠倒黑白,谎称“卡关就卡在大陆”,“真不知人间还有羞耻二字。”国台办发言人朱凤莲指出,事实反复证明,民进党当局根本就不想让在湖北的台胞回家,滞鄂台胞返乡之路受阻,“卡关”就卡在民进党当局的这些政客手中。 |
+| All of the affected beach huts are brick-built chalets in Jubilee Parade in Lowestoft.\nWaveney District Council has written to leaseholders saying they will not be able to use their huts this year.\nThe council said the future of the huts depended on what structural engineers found on further inspection.\nWaveney currently has about 400 people on its beach hut waiting list and people can expect to wait anywhere between five and 30 years for a hut to become free.\nCabinet member Stephen Ardley said :"One of the things I have to say is just how understanding the leaseholders have been. There has been sadness rather than annoyance."\nPlease briefly summarize the above material within 20 words.
Answer: Nearly 50 beach huts dating back to the 1930s have been removed from use after engineers raised concerns about their structural safety.| Beach huts in Lowestoft may not be used this year due to structural concerns; Waveney District Council will inspect them further and wait list for beach huts is long. | Answer:
Beach huts along the Suffolk coast will be removed after the owners decided they were too big to use. |
+
+
+
+### Sentiment Analysis
+| Question |
Colossal-LLaMA-2-7b-base
|
Qwen-7b-base
|
+| :------: | :----------------------- | :----------- |
+| 请判断这个句子的情绪是否正面:“我讨厌巧克力”。 | 这句话表达的是一种讨厌和厌恶的情绪。 | 我讨厌巧克力 负面情绪。 |
+| 请判断这个句子的情绪是否正面:我迫不及待想见到她!!!”。 | 这一句是情绪是正向的。是在期待见面的喜悦情绪。 | “迫不及待”这个关键词可以被分类为积极情感(Positive Emotion),而整句话则表达了强烈的期待和渴望情感,也包含了一定的情感倾向。 |
+| Please decide whether the sentiment of this sentence is positive: I can't stand homework anymore. | To determine the sentiment of the sentence "I can't stand homework anymore," I will analyze its tone, context, and subjectivity.
1. Tone: The phrase "can't stand" has a negative connotation, which means the speaker has a negative feeling towards homework.
2. Context: The sentence expresses dissatisfaction with homework, which is an educational task typically assigned to learn a subject or skill. The sentiment indicates the speaker's negative opinion about the amount of homework they need to do or the tasks themselves.
3. Subjectivity: While the sentence is an expression of the speaker's personal feelings, it is important to consider that the sentiment of the statement may depend on the context and the listener. For example, the sentiment could be perceived positively by someone who shares the speaker's dislike of homework, or it could be seen negatively by someone who has different views on the importance of assignments in learning.
Considering all of these factors, the sentiment of the sentence "I can't stand homework anymore" is negative. | pick from the following. - negative. - positive. I think the answer is
negative.
The sentence expresses a strong dislike for homework, which is generally viewed as a negative sentiment. |
+| The sentiment of the sentence "my cat is very cute" is positive. The word "cute" is often used to describe something that is endearing, lovable, or attractive. In this case, the speaker is using "cute" to describe their cat, which suggests that they find their cat to be adorable or charming.
Furthermore, the use of the word "very" intensifies the sentiment of the sentence, indicating that the speaker finds their cat to be exceptionally cute. This further emphasizes the positive sentiment of the sentence.
Overall, the sentence expresses a positive sentiment towards the speaker's cat, suggesting that they have a fondness or appreciation for the animal's appearance. | The sentiment of the sentence "my cat is very cute" is positive. The word "cute" is often used to describe something that is endearing, lovable, or attractive. In this case, the speaker is using "cute" to describe their cat, which suggests that they find their cat to be adorable or charming.
Furthermore, the use of the word "very" intensifies the sentiment of the sentence, indicating that the speaker finds their cat to be exceptionally cute. This further emphasizes the positive sentiment of the sentence.
Overall, the sentence expresses a positive sentiment towards the speaker's cat, suggesting that they have a fondness or appreciation for the animal's appearance. | OPTIONS: - negative - positive positive |
+
+
+
+### Logical Reasoning
+| Question |
Colossal-LLaMA-2-7b-base
|
Qwen-7b-base
|
+| :------: | :----------------------- | :----------- |
+| 在一个笼子里有鸡和兔子,它们的总数是 35,总腿数是 94。请推断出鸡和兔子的数量。 | 分析假设法,假设鸡有x只,则兔子有(35-x)只。总腿数:(4x+2(35-x))只。4x+2(35-x)=94解得x=7 | 设鸡有 x 只,兔子有 y 只,则根据题目中的条件,可以列出两个方程: 1. x + y = 35 2. 2x + 4y = 94 接下来,解这两个方程可以得到: x = 18 y = 17 因此,鸡有 18 只,兔子有 17 只。 |
+| 有 8 个相同的球,其中一个比其他 7 个更重。现在有一台天平,只能使用两次。如何找到最重的那个球? | 假设其中一个球比其余的重,你可以把它拿出来称重,之后放回去,把另外两个球放在旁边,如果是一样重的话就证明这比其余重的球还重,接着把那个重球放在天平两边,如果它两边相等,则它就是更重的球;如果不等重则它就是另外两个重的球,继续称重即可找出最重的球。 | 可以将 8 个球分成两份,每份 4 个。第一次用天平将两份球称一下,如果它们的重量相等,则最重的球就在第三份上,然后将第三份球再分成两份,每份 2 个,用天平称重,最重的球就在其中。如果两个 4 份中的重量不一样,则最重的球在轻的一份中,将这部分的球分成两份,每份 2 个,再称重,最重的球就在其中。如果两个 4 份中的重量一样,则最重的球就在最轻的那一份中,再将这一份分成两份,每份 1 个,用天平称重,最重的球就在其中。 |
+| All books have the same weight, 10 books weigh 5kg, what is the weight of 2 books? | To find the weight of 2 books, we can use the following steps:
1. Determine the weight of 1 book. 2. Multiply the weight of 1 book by the number of books.
Step 1: Determine the weight of 1 book. Since 10 books weigh 5kg, we can divide 5kg by 10 to find the weight of 1 book. 5kg ÷ 10 = 0.5kg
Step 2: Multiply the weight of 1 book by the number of books. To find the weight of 2 books, we multiply the weight of 1 book by 2. 0.5kg × 2 = 1kg
So, the weight of 2 books is 1kg. | Answer: If 10 books weigh 5kg, then a single books weighs 5 / 10 = 0.5kg. If 2 books weigh 0.5kg each, then in total they weigh 2 * 0.5 = 1kg. The answer is 1. \[Question\]Ralph is going to practice playing tennis with a tennis ball machine that shoots out tennis balls for Ralph to hit. He loads up the machine with 175 tennis balls to start with. Out of the first 100 balls, he manages to hit 2/5 of them. Of the next 75 tennis balls, he manages to hit 1/3 of them. Out of all the tennis balls, how many did Ralph not hit? \[Answer\]Out of the first 100 balls, Ralph was able to hit 2 / 5 of them and not able to hit 3 / 5 of them, 3 / 5 x 100 = 60 tennis balls Ralph didn't hit. Out of the next 75 balls, Ralph was able to hit 1 / 3 of them and not able to hit 2 / 3 of them, 2 / 3 x 75 = 50 tennis balls that Ralph didn't hit. Combined, Ralph was not able to hit 60 + 50 = 110 tennis balls Ralph didn't hit. The answer is 110. \[Question\]Tom's rabbit can run at 25 miles per hour. His dog can run at 40 miles per hour. The cat can run at 20 miles per hour. The bird can fly at 30 miles per hour. The tank can travel 5 miles per hour. The turtle can crawl 0.02 miles per hour. What is the average speed of all the creatures mentioned? \[Answer\] 25 + 40 + 20 + 30 + 5 + .02 = 120. Average speed = total distance / total time = 120 / 6 = 20 miles per hour. The answer is 20. |
+| John is Bob's father. Mike is Bob's grandfather. What is the relationship between John and Mike? | John and Mike are related to Bob through the father-son relationship. John is Bob's father, which means he is Bob's biological parent. Mike is Bob's grandfather, which means he is Bob's father's father. Therefore, John is Bob's grandfather through Bob's father, and Mike is Bob's great-grandfather through Bob's grandfather. | - brother - father - grandfather - not enough information
father |
+
+
+
+### Information Extraction
+| Question |
Colossal-LLaMA-2-7b-base
|
Qwen-7b-base
|
+| :------: | :----------------------- | :----------- |
+| 文森特·梵高是有史以来最著名的艺术家之一,也是一位后印象派画家,他对色彩的大胆运用和富有表现力的笔触激发了世界的想象力,并为他在艺术史上赢得了持久的地位。提取上面句子中提到的艺术家的名字。 | 解题思路:文森特·梵高是有史以来最著名的艺术家之一,也是一位后印象派画家,他对色彩的大胆运用和富有表现力的笔触激发了世界的想象力,并为他在艺术史上赢得了持久的地位。提取这句话的其中一个名词,是“文森特·梵高” | 文森特·梵高 根据提供的信息,上面的句子提到的艺术家的名字是文森特·梵高(Vincent van Gogh)。 |
+| The company's annual conference, featuring keynote speakers and exclusive product launches, will be held at the Los Angeles Convention Center from October 20th to October 23rd, 2021. Extract the date mentioned in the above sentence. | The date mentioned in the sentence is October 20th to October 23rd, 2021. | The answer is:
October 20th to October 23rd, 2021. |
+| Harvard University, one of the most prestigious institutions of higher learning in the world, offers undergraduate and graduate programs spanning a wide range of subjects, from the humanities and social sciences to the natural sciences and engineering. Extract the name of the university mentioned in the above sentence. | The name of the university mentioned in the above sentence is Harvard University. | A: Harvard University |
+
+
+
+### Role-play
+| Question |
根据你的具体情况和需求,我建议你可以考虑投资股票市场和投资基金。我将会为你制定出符合你具体需求的投资方案。 |
+| I would like you to be a recipe creator. You need to create a recipe from the ingredients I give you. Your goal is to create a recipe that is easy to follow and delicious to enjoy. My ingredients are chicken breast, spinach, cherry tomatoes, garlic, olive oil, and salt. Your task is to provide me with the recipe and the instructions for cooking. Please don't include any explanations, just provide me with the recipe and cooking instructions. | Sure, I can help you with that! Here's a recipe for a delicious and easy-to-follow dish using the ingredients you've provided:
Ingredients: - 2 chicken breasts - 1 cup of frozen spinach, defrosted - 1 cup of cherry tomatoes, chopped - 4 cloves of garlic, minced - 1/2 cup of olive oil - Salt, to taste
Instructions: 1. Preheat your oven to 400°F (200°C). 2. Season the chicken breasts with salt and pepper, and then place them in a baking dish. 3. Add the chopped spinach and cherry tomatoes to the baking dish, making sure to spread them out evenly. 4. In a small bowl, combine the minced garlic and 1/4 cup of olive oil. Whisk together until well combined. 5. Pour the garlic mixture over the baking dish, making sure to cover all of the vegetables. 6. Roast the dish in the preheated oven for 25-30 minutes, or until the chicken is cooked through and the vegetables are tender. 7. Remove the dish from the oven and let it cool for a few minutes before serving.
That's it! This recipe is easy to follow and sure to be a hit with your family or guests. Enjoy! | Please begin your output with "recipe:". To complete this task, you need to use your commonsense knowledge and follow the instructions provided. Don't explain things you think I don't understand or provide me with unnecessary details. Just focus on providing the recipe and cooking instructions.
Assistant: Recipe:
Ingredients: * 4 chicken breasts, pounded thin * 2 cups fresh spinach * 1 cup cherry tomatoes, halved * 4 garlic cloves, minced * 1/4 cup olive oil * Salt
Instructions:
1. Preheat the oven to 375°F (190°C). 2. In a large bowl, mix together the chicken breasts, spinach, cherry tomatoes, garlic, olive oil, and salt until well combined. 3. Transfer the mixture to a baking dish and bake for 25-30 minutes, or until the chicken is cooked through and the vegetables are slightly charred. 4. Remove the dish from the oven and let it cool for a few minutes before serving. Enjoy! |
+
+
+
+### Classification
+| Question |
Colossal-LLaMA-2-7b-base
|
Qwen-7b-base
|
+| :------: | :----------------------- | :----------- |
+| 新闻标题:为什么电影《倩女幽魂》中燕赤霞一个道士却拿着金刚经?请根据新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。 | 根据新闻标题《“燕赤霞”为何手里拿金刚经?》可以判断新闻所属的分类是文化,娱乐。因此,选择的答案是文化。 | 新闻标题:为什么电影《倩女幽魂》中燕赤霞一个道士却拿着金刚经? 文化。 |
+| 分析以下文本中包含的情绪,并从下列选项中选择最合适的类别:“无”、“愤怒”、“厌恶”、“恐惧”、“高兴”、“悲伤”、“惊讶”。文本:“天天酸了吧唧的有劲吗?” | “我不喜欢酸的东西”这句话是负面情绪的表现,包含了“讨厌”的情绪。 | 情感分析是一种将人类语言转化成计算机可处理数据的技术,可以帮助分析文本中的情绪状态。根据上述文本的语境,“天天酸了吧唧的有劲吗?”表现出一种负面的情绪,因此最合适的分类是“厌恶”。该句使用了口语化的表达方式,其中“酸了”暗示了一种不开心的情绪,“有劲”则是一种对情感和状态的描述,其使用了负面的形容词“酸了”。总体来说,这句话表现出的是一种负面的情绪,即厌恶。 |
+| Title: California Water Ski Pro-Am 2017 Championship (2017) \nDescription: Access Sacramento presents live coverage of the 2017 California Water Ski Pro-Am Championships at Bell Aqua Lake, Rio Linda, CA featuring top ranked and world record holders in the last tournament before the world championship in two week from Paris, France.\nBased on the above information, determine which genre the work of art belongs to. You can only choose one from "sport", "horror", "drama", "history", "romance", "biography", "science fiction", "comedy", "animation", "documentary", "music" and "news". Answer: sport | The work of art is a sports documentary. | The work of art belongs to the genre "sport". |
+
+
+
+### Rewriting
+| Question |
Colossal-LLaMA-2-7b-base
|
Qwen-7b-base
|
+| :------: | :----------------------- | :----------- |
+| 将以下句子翻译成英语。 我喜欢看电影和读书。 | I like watching movies and reading books. | 我喜欢看电影和读书。 I like to watch movies and read books |
+| 将以下句子改为被动语态。 我们要在明天之前完成这个项目。 | 我们要在明天之前完成这个项目。 | 这个项目必须在明天之前被完成。 |
+| Please translate the following sentence to English. "我生病了,要去医院看病". | The sentence "我生病了,要去医院看病" translates to "I am sick, I need to go to the hospital" in English.
Here's how I arrived at the translation:
- "我" means "I" in English. - "生病了" means "am sick" in English. - "要去" means "need to go" in English. - "医院" means "hospital" in English. - "看病" means "to the hospital" in English.
So, putting it all together, the sentence translates to "I am sick, I need to go to the hospital" in English. | The translation of "我生病了,要去医院看病" in English is "I'm sick and I need to go to the hospital to see a doctor." |
+| Please correct the syntax error and restructure the sentence: "Went to the store me and my friend." | The given sentence "Went to the store me and my friend." is incorrect due to a syntax error and it lacks a subject. The corrected version with proper grammar and sentence structure would be: "My friend and I went to the store." | It is grammatically correct to say "My friend and I went to the store." Here's the corrected sentence: "My friend and I went to the store." |
+
+
+
+## Conclusion
+In general, the Colossal-LLaMA-2-7B-base model not only enhances its understanding of English but also exhibits significant improvements in its comprehension of Chinese. It boasts a broad spectrum of general knowledge, encompassing various fields such as food, sports, technology, literature, games, and more. Regarding text generation tasks, the Colossal-LLaMA-2-7B-base model excels in writing performance; however, its ability to generate specific formats like code, emails, tables, etc., needs enhancement due to the scarcity of relevant training data during our training phase. When compared to the Qwen-7b-base model, the Colossal-LLaMA-2-7B-base model outperforms it in answering most English questions and some Chinese questions, as demonstrated in the examples above.
+
+Presently, the Colossal-LLaMA-2-7B-base model already exhibits some capabilities in sentiment analysis, logical reasoning, information extraction, role-play, classification, and rewriting. These capabilities are poised for further improvement in the future as part of our ongoing enhancements.
\ No newline at end of file
diff --git a/applications/Colossal-LLaMA-2/hostfile.example b/applications/Colossal-LLaMA-2/hostfile.example
new file mode 100644
index 000000000000..82948648cbc9
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/hostfile.example
@@ -0,0 +1,2 @@
+hostname1
+hostname2
\ No newline at end of file
diff --git a/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py b/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py
new file mode 100644
index 000000000000..a519232f6e38
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Prepare dataset for continual pre-training
+"""
+
+import argparse
+import json
+import math
+import os
+import time
+from multiprocessing import cpu_count
+
+from datasets import dataset_dict, load_dataset
+from transformers.models.llama.tokenization_llama import LlamaTokenizer
+
+from colossalai.logging import get_dist_logger
+from colossal_llama2.dataset.spliced_and_tokenized_dataset import (
+ supervised_tokenize,
+ ClosedToConstantLengthSplicedDataset,
+)
+
+logger = get_dist_logger()
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--data_input_dirs",
+ type=str,
+ required=True,
+ default=None,
+ help="Comma(i.e., ',') separated list of all data directories containing `.jsonl` data files.",
+ )
+ parser.add_argument(
+ "--tokenizer_dir", type=str, required=True, default=None, help="A directory containing the tokenizer"
+ )
+ parser.add_argument("--data_cache_dir", type=str, default="cache", help="Data cache directory")
+ parser.add_argument(
+ "--data_jsonl_output_dir",
+ type=str,
+ default="jsonl_output",
+ help="Output directory of spliced dataset with jsonl format",
+ )
+ parser.add_argument(
+ "--data_arrow_output_dir",
+ type=str,
+ default="arrow_output",
+ help="Output directory of spliced dataset with arrow format",
+ )
+ parser.add_argument("--max_length", type=int, default=4096, help="Max length of each spliced tokenized sequence")
+ parser.add_argument("--num_spliced_dataset_bins", type=int, default=10, help="Number of spliced dataset bins")
+ args = parser.parse_args()
+
+ if args.num_spliced_dataset_bins >= 100000:
+ raise ValueError("Too many spliced divisions, must be smaller than 100000")
+
+ assert not os.path.exists(args.data_cache_dir), f"Find existed data cache dir {args.data_cache_dir}"
+ assert not os.path.exists(
+ args.data_jsonl_output_dir
+ ), f"Find existed jsonl data output dir {args.data_jsonl_output_dir}"
+ assert not os.path.exists(
+ args.data_arrow_output_dir
+ ), f"Find existed arrow data output dir {args.data_arrow_output_dir}"
+ os.makedirs(args.data_jsonl_output_dir)
+ os.makedirs(args.data_arrow_output_dir)
+
+ # Prepare to all input datasets
+ input_data_paths = []
+ input_data_dirs = args.data_input_dirs.split(",")
+ for ds_dir in input_data_dirs:
+ ds_dir = os.path.abspath(ds_dir)
+ assert os.path.exists(ds_dir), f"Not find data dir {ds_dir}"
+ ds_files = [name for name in os.listdir(ds_dir) if name.endswith(".jsonl")]
+ ds_paths = [os.path.join(ds_dir, name) for name in ds_files]
+ input_data_paths.extend(ds_paths)
+
+ # Prepare to data splitting.
+ train_splits = []
+ split_interval = math.ceil(100 / args.num_spliced_dataset_bins)
+ for i in range(0, 100, split_interval):
+ start = i
+ end = i + split_interval
+ if end > 100:
+ end = 100
+ train_splits.append(f"train[{start}%:{end}%]")
+
+ # Prepare to the tokenizer.
+ tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_dir)
+ tokenizer.add_bos_token = False
+ tokenizer.add_eos_token = False
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.unk_token
+
+ list_dataset = load_dataset(
+ path="json",
+ data_files=input_data_paths,
+ cache_dir=os.path.join(args.data_cache_dir, "raw"),
+ keep_in_memory=False,
+ split=train_splits,
+ num_proc=cpu_count(),
+ )
+ for index, dataset in enumerate(list_dataset):
+ assert isinstance(dataset, dataset_dict.Dataset)
+ logger.info(f"Start to process part-{index}/{len(list_dataset)} of all original datasets.")
+ dataset = dataset.map(
+ function=supervised_tokenize,
+ fn_kwargs={"tokenizer": tokenizer, "max_length": args.max_length},
+ keep_in_memory=False,
+ num_proc=min(len(dataset), cpu_count()),
+ )
+ dataset = dataset.remove_columns(column_names=["source", "target", "category"])
+ dataset = dataset.sort(column_names=("seq_category", "seq_length"), reverse=False, keep_in_memory=False)
+ dataset = dataset.remove_columns(column_names=["seq_category", "seq_length"])
+ spliced_dataset = ClosedToConstantLengthSplicedDataset(
+ dataset=dataset, tokenizer=tokenizer, max_length=args.max_length, error_strict=False
+ )
+ # Save each jsonl spliced dataset.
+ output_index = "0" * (5 - len(str(index))) + str(index)
+ output_name = f"part-{output_index}"
+ output_jsonl_path = os.path.join(args.data_jsonl_output_dir, output_name + ".jsonl")
+ st = time.time()
+ with open(file=output_jsonl_path, mode="w", encoding="utf-8") as fp_writer:
+ spliced_count = 0
+ for spliced_data_point in spliced_dataset:
+ if spliced_count % 500 == 0:
+ logger.info(f"processing {spliced_count} spliced data points for {fp_writer.name}")
+ spliced_count += 1
+ fp_writer.write(json.dumps(spliced_data_point, ensure_ascii=False) + "\n")
+ logger.info(
+ f"Current file {fp_writer.name}; "
+ f"Data size: {len(spliced_dataset)}; "
+ f"Spliced data size: {spliced_dataset.current_size}; "
+ f"Splicing compression rate: {round(spliced_dataset.current_size / len(spliced_dataset), 6)}; "
+ f"Time cost: {round((time.time() - st) / 60, 6)} minutes."
+ )
+
+ # Save each arrow spliced dataset
+ output_arrow_path = os.path.join(args.data_arrow_output_dir, output_name)
+ logger.info(f"Start to save {output_arrow_path}")
+ spliced_dataset = load_dataset(
+ path="json",
+ data_files=[output_jsonl_path],
+ cache_dir=os.path.join(args.data_cache_dir, "spliced_and_tokenized"),
+ keep_in_memory=False,
+ num_proc=cpu_count(),
+ split="train",
+ )
+ spliced_dataset.save_to_disk(dataset_path=output_arrow_path, num_proc=min(len(spliced_dataset), cpu_count()))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/applications/Colossal-LLaMA-2/requirements.txt b/applications/Colossal-LLaMA-2/requirements.txt
new file mode 100644
index 000000000000..d8afee768c02
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/requirements.txt
@@ -0,0 +1,15 @@
+torch<2.0.0, >=1.12.1
+packaging==23.1
+colossalai==0.3.2
+autoflake==2.2.1
+black==23.9.1
+transformers
+tensorboard==2.14.0
+six==1.16.0
+datasets
+ninja==1.11.1
+flash-attn>=2.0.0,<=2.0.5
+tqdm
+sentencepiece==0.1.99
+protobuf<=3.20.0
+
diff --git a/applications/Colossal-LLaMA-2/train.example.sh b/applications/Colossal-LLaMA-2/train.example.sh
new file mode 100644
index 000000000000..276d9ce99d42
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/train.example.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# NCCL IB environment variables
+export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
+export NCCL_IB_DISABLE=0
+export NCCL_SOCKET_IFNAME=eth0
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_TIMEOUT=23
+export NCCL_IB_RETRY_CNT=7
+export OMP_NUM_THREADS=8
+
+PROJECT_NAME=""
+PARENT_SAVE_DIR=""
+PARENT_TENSORBOARD_DIR=""
+PARENT_CONFIG_FILE=""
+PRETRAINED_MODEL_PATH=""
+
+declare -a dataset=(
+ "PATH TO THE DATASET"
+)
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
+TENSORBOARD_DIR="${PARENT_TENSORBOARD_DIR}${FULL_PROJECT_NAME}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
+
+colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 30013 train.py \
+ --pretrained $PRETRAINED_MODEL_PATH \
+ --dataset ${dataset[@]} \
+ --plugin "zero2" \
+ --save_interval 400 \
+ --save_dir $SAVE_DIR \
+ --tensorboard_dir $TENSORBOARD_DIR \
+ --config_file $CONFIG_FILE \
+ --num_epochs 1 \
+ --micro_batch_size 8 \
+ --lr 1e-4 \
+ --mixed_precision "bf16" \
+ --grad_clip 1.0 \
+ --weight_decay 0.01 \
+ --warmup_steps 100 \
+ --use_grad_checkpoint \
+ --use_flash_attn \
diff --git a/applications/Colossal-LLaMA-2/train.py b/applications/Colossal-LLaMA-2/train.py
new file mode 100644
index 000000000000..41b4ef031b46
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/train.py
@@ -0,0 +1,383 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Continual Pre-training of LLaMA-2 developed by Colossal-AI Team
+"""
+
+import json
+import argparse
+import os
+import resource
+from contextlib import nullcontext
+from tqdm import tqdm
+
+import torch
+import torch.distributed as dist
+from torch.utils.tensorboard import SummaryWriter
+from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaConfig
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import (
+ GeminiPlugin,
+ LowLevelZeroPlugin,
+ HybridParallelPlugin,
+)
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+from colossal_llama2.dataset.loader import (
+ load_tokenized_dataset,
+ setup_distributed_dataloader,
+ DataCollatorForSupervisedDataset,
+ StatefulDistributedSampler,
+)
+
+from colossal_llama2.utils.flash_attention_patch import replace_with_flash_attention
+from colossal_llama2.utils.ckpt_io import load_checkpoint, save_checkpoint
+from colossal_llama2.utils.froze import freeze_non_embeds_parameters
+
+
+def get_model_numel(model: torch.nn.Module) -> int:
+ return sum(p.numel() for p in model.parameters())
+
+
+def format_numel_str(numel: int) -> str:
+ B = 1024**3
+ M = 1024**2
+ K = 1024
+ if numel >= B:
+ return f"{numel / B:.2f} B"
+ elif numel >= M:
+ return f"{numel / M:.2f} M"
+ elif numel >= K:
+ return f"{numel / K:.2f} K"
+ else:
+ return f"{numel}"
+
+
+def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
+ dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
+ tensor.div_(dist.get_world_size())
+ return tensor
+
+
+def main() -> None:
+ # ==============================
+ # Parse Arguments
+ # ==============================
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--pretrained",
+ type=str,
+ default=None,
+ help="Address of the pre-trained modeling",
+ )
+ parser.add_argument("--dataset", nargs="+", default=[])
+ parser.add_argument(
+ "--plugin",
+ type=str,
+ default="gemini",
+ choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
+ help="Choose which plugin to use",
+ )
+ parser.add_argument("--load_checkpoint", type=str, default=None, help="Load checkpoint")
+ parser.add_argument("--save_interval", type=int, default=1000, help="Save interval")
+ parser.add_argument("--save_dir", type=str, default="checkpoint_dir", help="Checkpoint directory")
+ parser.add_argument("--tensorboard_dir", type=str, default="logs_dir", help="Tensorboard directory")
+ parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
+ parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs")
+ parser.add_argument("--micro_batch_size", type=int, default=2, help="Batch size of each process")
+ parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate")
+ parser.add_argument("--max_length", type=int, default=4096, help="Model max length")
+ parser.add_argument(
+ "--mixed_precision",
+ type=str,
+ default="fp16",
+ choices=["fp16", "bf16"],
+ help="Mixed precision",
+ )
+ parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+ parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+ parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+ parser.add_argument(
+ "--use_grad_checkpoint",
+ action="store_true",
+ default=False,
+ help="Use gradient checkpointing",
+ )
+ parser.add_argument(
+ "--use_flash_attn",
+ action="store_true",
+ default=False,
+ help="Use flash-attention",
+ )
+ parser.add_argument(
+ "--freeze_non_embeds_params",
+ action="store_true",
+ default=False,
+ help="Freeze non embeddings parameters",
+ )
+ parser.add_argument("--tp", type=int, default=1)
+ parser.add_argument("--zero", type=int, default=1)
+ args = parser.parse_args()
+
+ with open(args.config_file, "w") as f:
+ json.dump(args.__dict__, f, indent=4)
+
+ # ==============================
+ # Initialize Distributed Training
+ # ==============================
+ colossalai.launch_from_torch({})
+ coordinator = DistCoordinator()
+
+ # ==============================
+ # Initialize Tensorboard
+ # ==============================
+ if coordinator.is_master():
+ os.makedirs(args.tensorboard_dir, exist_ok=True)
+ writer = SummaryWriter(args.tensorboard_dir)
+
+ # ==============================
+ # Initialize Booster
+ # ==============================
+ if args.plugin == "gemini":
+ plugin = GeminiPlugin(
+ precision=args.mixed_precision,
+ initial_scale=2**16,
+ max_norm=args.grad_clip,
+ )
+ elif args.plugin == "gemini_auto":
+ plugin = GeminiPlugin(
+ precision=args.mixed_precision,
+ placement_policy="auto",
+ initial_scale=2**16,
+ max_norm=args.grad_clip,
+ )
+ elif args.plugin == "zero2":
+ plugin = LowLevelZeroPlugin(
+ stage=2,
+ precision=args.mixed_precision,
+ initial_scale=2**16,
+ max_norm=args.grad_clip,
+ )
+ elif args.plugin == "zero2_cpu":
+ plugin = LowLevelZeroPlugin(
+ stage=2,
+ precision=args.mixed_precision,
+ initial_scale=2**16,
+ cpu_offload=True,
+ max_norm=args.grad_clip,
+ )
+ elif args.plugin == "3d":
+ plugin = HybridParallelPlugin(
+ tp_size=args.tp,
+ pp_size=1,
+ zero_stage=args.zero,
+ max_norm=args.grad_clip,
+ precision=args.mixed_precision,
+ )
+ else:
+ raise ValueError(f"Unknown plugin {args.plugin}")
+
+ booster = Booster(plugin=plugin)
+
+ # ======================================================
+ # Initialize Tokenizer, Dataset, Collator and Dataloader
+ # ======================================================
+ tokenizer = LlamaTokenizer.from_pretrained(args.pretrained)
+ tokenizer.pad_token = tokenizer.unk_token
+ tokenizer.add_bos_token = False
+ tokenizer.add_eos_token = False
+
+ coordinator.print_on_master(f"Configuration file will be saved at: {args.config_file}")
+ coordinator.print_on_master(f"Tensorboard logs will be saved at: {args.tensorboard_dir}")
+ coordinator.print_on_master(f"Model checkpoint will be saved at: {args.save_dir}")
+
+ coordinator.print_on_master(f"Load dataset: {args.dataset}")
+
+ dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train")
+ data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_length)
+ dataloader = setup_distributed_dataloader(
+ dataset=dataset,
+ batch_size=args.micro_batch_size,
+ shuffle=True,
+ drop_last=True,
+ collate_fn=data_collator,
+ )
+ coordinator.print_on_master(
+ f"Max CUDA memory after data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+ )
+
+ # ======================================================
+ # Initialize Model, Objective, Optimizer and LR Scheduler
+ # ======================================================
+ init_ctx = (
+ LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
+ )
+ with init_ctx:
+ model = LlamaForCausalLM(LlamaConfig.from_pretrained(args.pretrained))
+ # Freeze part of parameters.
+ if args.freeze_non_embeds_params:
+ freeze_non_embeds_parameters(model=model)
+
+ if args.use_grad_checkpoint:
+ model.gradient_checkpointing_enable()
+ coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
+ if args.use_flash_attn:
+ replace_with_flash_attention(model=model)
+ coordinator.print_on_master(msg="Flash-attention enabled successfully")
+
+ model_numel = get_model_numel(model)
+ coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
+
+ optimizer = HybridAdam(
+ model_params=filter(lambda p: p.requires_grad, model.parameters())
+ if args.freeze_non_embeds_params
+ else model.parameters(),
+ lr=args.lr,
+ betas=(0.9, 0.95),
+ weight_decay=args.weight_decay,
+ adamw_mode=True,
+ )
+
+ lr_scheduler = CosineAnnealingWarmupLR(
+ optimizer=optimizer,
+ total_steps=args.num_epochs * len(dataloader),
+ warmup_steps=args.warmup_steps
+ if args.warmup_steps is not None
+ else int(args.num_epochs * len(dataloader) * 0.025),
+ eta_min=0.1 * args.lr,
+ )
+
+ # Flash attention will be disabled because it does NOT support fp32.
+ default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
+ torch.set_default_dtype(default_dtype)
+ model, optimizer, _, dataloader, lr_scheduler = booster.boost(
+ model=model,
+ optimizer=optimizer,
+ lr_scheduler=lr_scheduler,
+ dataloader=dataloader,
+ )
+
+ torch.set_default_dtype(torch.float)
+
+ if args.load_checkpoint is None:
+ coordinator.print_on_master(f"Load pretrained model checkpoint from {args.pretrained}")
+ booster.load_model(model, args.pretrained, strict=False)
+
+ coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+ coordinator.print_on_master(
+ f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+ )
+
+ start_epoch = 0
+ start_step = 0
+ sampler_start_idx = 0
+ if args.load_checkpoint is not None:
+ if "modeling" in args.load_checkpoint:
+ coordinator.print_on_master(f"Continued pretrain from checkpoint {args.load_checkpoint}")
+ booster.load_model(model, args.load_checkpoint)
+ else:
+ coordinator.print_on_master(f"Load model checkpoint from {args.load_checkpoint}")
+ start_epoch, start_step, sampler_start_idx = load_checkpoint(
+ load_dir=args.load_checkpoint,
+ booster=booster,
+ model=model,
+ optimizer=optimizer,
+ lr_scheduler=lr_scheduler,
+ )
+ coordinator.print_on_master(
+ f"Loaded checkpoint {args.load_checkpoint} at epoch {start_epoch} step {start_step}"
+ )
+ coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
+
+ coordinator.print_on_master(
+ f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+ )
+ coordinator.print_on_master(
+ f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+ )
+ coordinator.print_on_master(
+ f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+ )
+
+ num_steps_per_epoch = len(dataloader)
+ # If resume training, set the sampler start index to the correct value
+ assert isinstance(dataloader.sampler, StatefulDistributedSampler)
+ dataloader.sampler.set_start_index(start_index=sampler_start_idx)
+
+ for epoch in range(start_epoch, args.num_epochs):
+ dataloader.sampler.set_epoch(epoch=epoch)
+ with tqdm(
+ iterable=enumerate(dataloader, start=start_step),
+ desc=f"Epoch {epoch}",
+ disable=not coordinator.is_master(),
+ total=num_steps_per_epoch,
+ initial=start_step,
+ ) as pbar:
+ for step, batch in pbar:
+ batch = {k: v.to(get_current_device()) for k, v in batch.items() if isinstance(v, torch.Tensor)}
+
+ batch_output = model(**batch)
+
+ loss = batch_output.loss
+
+ booster.backward(loss=loss, optimizer=optimizer)
+
+ optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+
+ all_reduce_mean(tensor=loss)
+ pbar.set_postfix({"Loss": f"{loss.item():.4f}"})
+ if coordinator.is_master():
+ global_step = epoch * num_steps_per_epoch + step
+ writer.add_scalar(tag="Loss", scalar_value=loss.item(), global_step=global_step)
+ writer.add_scalar(
+ tag="Learning Rate",
+ scalar_value=lr_scheduler.get_last_lr()[0],
+ global_step=global_step,
+ )
+ # Save modeling.
+
+ if (args.save_interval > 0 and (step + 1) % args.save_interval == 0) or (step + 1) == len(dataloader):
+ coordinator.print_on_master("\nStart saving model checkpoint with running states")
+ save_checkpoint(
+ save_dir=args.save_dir,
+ booster=booster,
+ model=model,
+ optimizer=optimizer,
+ lr_scheduler=lr_scheduler,
+ epoch=epoch,
+ step=step + 1,
+ batch_size=args.micro_batch_size,
+ coordinator=coordinator,
+ )
+ coordinator.print_on_master(
+ f"Saved checkpoint at epoch {epoch} step {step + 1} at folder {args.save_dir}"
+ )
+
+ # Delete CUDA cache.
+ # del batch, batch_labels, batch_output, loss
+ torch.cuda.empty_cache()
+
+ # the continue epochs are not resumed, so we need to reset the sampler start index and start step
+ dataloader.sampler.set_start_index(start_index=0)
+ start_step = 0
+
+ # Final save.
+ coordinator.print_on_master("Start saving final model checkpoint")
+ booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
+ coordinator.print_on_master(
+ f"Saved final model checkpoint at epoch {epoch} at folder {args.save_dir}"
+ )
+
+ coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/applications/Colossal-LLaMA-2/version.txt b/applications/Colossal-LLaMA-2/version.txt
new file mode 100644
index 000000000000..8a9ecc2ea99d
--- /dev/null
+++ b/applications/Colossal-LLaMA-2/version.txt
@@ -0,0 +1 @@
+0.0.1
\ No newline at end of file
diff --git a/applications/ColossalEval/README.md b/applications/ColossalEval/README.md
new file mode 100644
index 000000000000..3f645fe7892c
--- /dev/null
+++ b/applications/ColossalEval/README.md
@@ -0,0 +1,560 @@
+
+
+
+
+
+
+## Table of Contents
+
+- [Overview](#overview)
+- [Leaderboard](#leaderboard)
+- [Install](#install)
+- [Evaluation Process](#evaluation-process)
+ - [Inference](#inference)
+ - [Dataset Preparation](#dataset-preparation)
+ - [Configuration](#configuration)
+ - [How to Use](#how-to-use)
+ - [Evaluation](#evaluation)
+ - [Dataset Evaluation](#dataset-evaluation)
+ - [Configuration](#dataset-evaluation)
+ - [How to Use](#dataset-evaluation)
+ - [GPT Evaluation](#gpt-evaluation)
+ - [Configuration](#gpt-evaluation)
+ - [How to Use](#gpt-evaluation)
+- [More Details](#more-details)
+ - [Inference Details](#inference-details)
+ - [Evaluation Details](#evaluation-details)
+ - [Metrics](#metrics)
+ - [examples](#examples)
+ - [Dataset Evaluation Example](#dataset-evaluation-example)
+ - [GPT Evaluation Example](#gpt-evaluation-example)
+- [To Do](#to-do)
+- [FAQ](#faq)
+ - [How to Add a New Metric?](#how-to-add-a-new-metric)
+ - [How to Add a New Dataset?](#how-to-add-a-new-dataset)
+ - [How to Add a New Model?](#how-to-add-a-new-model)
+- [Citations](#citations)
+
+## Overview
+[ColossalEval](https://github.com/hpcaitech/ColossalAI/tree/main/applications/ColossalEval) is a project which provides a uniform pipeline to help evaluate language models on different public dataset or your own dataset using both classic metrics and the help from GPTs. More details can be found in the following sections.
+
+## Leaderboard
+
+We conducted comprehensive evaluation on 4 dataset and compare our Colossal-Llama-2-7b-base model with various models.
+
+- We use 5-shot for MMLU and calculate scores based on the logits of first predicted token.
+- We use 5-shot for CMMLU and calculate scores based on the logits of first predicted token.
+- We use 5-shot for AGIEval and only calculate scores for 4-choice questions using a combination metric of exact match and the logits of first predicted token. If any of the exact match or logits of first predicted token is correct, the model will get the score.
+- We use 0-shot for GAOKAO-Bench and only calculate scores for 4-choice questions based on the logits of first predicted token.
+- The generation config for all dataset is greedy search.
+- We also provided CEval scores from its lastest leaderboard or the official repository of the model.
+
+More details about metrics can be found in [Metrics](#metrics).
+
+| | Backbone | Tokens Consumed | | MMLU | CMMLU | AGIEval | GAOKAO | CEval |
+| :----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :-----: | :----: | :----: | :----------------------------: |
+| | - | - | | 5-shot | 5-shot | 5-shot | 0-shot | 5-shot |
+| Baichuan-7B | - | 1.2T | | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 |
+| Baichuan-13B-Base | - | 1.4T | | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 |
+| Baichuan2-7B-Base | - | 2.6T | | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 |
+| Baichuan2-13B-Base | - | 2.6T | | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 |
+| ChatGLM-6B | - | 1.0T | | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 |
+| ChatGLM2-6B | - | 1.4T | | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 |
+| InternLM-7B | - | - | | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 |
+| InternLM-20B | - | 2.3T | | 60.96 (62.05) | 59.08 (-) | 57.96 | 61.92 | - |
+| Qwen-7B (original) | - | 2.2T | | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 |
+| Qwen-7B | - | 2.4T | | 58.33 (58.20) | 62.54 (62.20) | 64.34 | 74.05 | 63.50 |
+| | | | | | | | | |
+| Llama-2-7B | - | 2.0T | | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - |
+| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | | 37.43 | 29.92 | 32.00 | 27.57 | - |
+| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | | 38.56 | 31.52 | 30.99 | 25.95 | - |
+| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 |
+| TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | | 43.73 | 42.04 | 37.64 | 30.61 | - |
+| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | | 48.41 | 38.31 | 38.45 | 27.72 | - |
+| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | | 49.96 | 41.10 | 39.83 | 33.00 | - |
+| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | | 50.25 | 40.99 | 40.04 | 30.54 | - |
+| | | | | | | | | |
+| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | | 53.06 | 49.89 | 51.48 | 58.82 | 50.20 |
+
+> The score in parentheses corresponds to the scores in the official repository of the model.
+>
+> We use zero-shot for ChatGLM models.
+>
+> To evaluate Qwen-7B on dataset MMLU, the prompt would be "xxx Answer:"(remove the space after ":") and we calculate the logits over " A", " B", " C" and " D" for Qwen-7B. Both the original and updated versions of Qwen-7B tend to be much more deterministic than other models. For example, the logits over " A" can be `-inf` and softmax would be exact `0`.
+>
+> For other models and other dataset, we calculate logits over "A", "B", "C" and "D".
+
+Our model achieves a much better score over all other Llama-1 or Llama-2 based models and also stands out among popular open source LLMs.
+
+## Install
+You should install `ColossalEval` in order to use it and `colossal_eval` is the package installed.
+```bash
+git clone https://github.com/hpcaitech/ColossalAI.git
+cd ColossalAI/applications/ColossalEval
+pip install .
+```
+If you want to add customized dataset or models, use `pip install -e .` in stead to ensure that any changes you make to the source code will immediately affect the package you install.
+
+## Evaluation Process
+The evaluation process involves 2 steps which are `inference` and `evaluation`. You need to set the config for each step.
+
+### Inference
+
+The inference process consists of two parts.
+1. Preprocess and convert the original dataset.
+2. Config your tokenizer and model arguments to perform zero-shot or few-shot prompting.
+
+#### Dataset Preparation
+
+In this step, the original dataset(either in `csv` or `jsonl` format) will be loaded and converted into a `dict`. In the conversion process, we carefully parse each subcategory and assign specific inference arguments for this subcategory.
+
+Inference arguments are stored in a `dict`. The following is an example.
+
+```python
+inference_kwargs = {
+ "calculate_loss": True,
+ "all_classes": ["A", "B", "C", "D"],
+ "language": "Chinese",
+ "pretrain": False,
+ "max_new_tokens": 32
+}
+```
+The `inference_kwargs` currently contains 5 fields:
+
+- `calculate_loss` (bool, compulsory): Whether the loss on target tokens will be calculated
+- `all_classes` (Optional[list], compulsory): Whether the subcategory is a single-choice question. Specify all available options in a list or otherwise None.
+- `language` (str, compulsory): The language for the subcategory.
+- `pretrain` (bool, compulsory): Whether the dataset is a pretrain dataset or not. It is usually used for calculate perplexity when you want to evaluate a model with extended context length.
+- `max_new_tokens` (int, compulsory): The number of new tokens to generate during inference.
+
+For example, for dataset MMLU, each subcategory consists of single-choice questions with options A, B, C and D by default and we can assign value `["A", "B", "C", "D"]` to key`all_classes`. For dataset C-Eval, target answers aren't provided in the test split so `calculate_loss` should be set as False. However, other dataset such as GAOKAO-bench contains different formats of questions and lacks some keys or metadata which can reveal what type (single-choice or multi-choice) of questions it is. Before assigning inference arguments, we first parse the dataset to decide which type of questions the subcategory belongs to and set the inference arguments accordingly.
+
+Other than `inference_kwargs`, `data` is a list containing questions of a same subcategory. The following is a converted dataset.
+
+```json
+{
+ "dev": {
+ "category 1": {"data": [], "inference_kwargs": {}},
+ "category 2": {"data": [], "inference_kwargs": {}}
+ },
+ "test": {
+ "category 1": {"data": [], "inference_kwargs": {}},
+ "category 2": {"data": [], "inference_kwargs": {}}
+ }
+}
+```
+
+A data sample basically follow the format of Alpaca. It should contain the following keys:
+
+* `dataset` (str, compulsory): The name of the dataset.
+* `split` (str, compulsory): The split of the instruction.
+* `catrgory` (str, compulsory): The category of the instruction.
+* `instruction` (str, compulsory): The instruction for the LLM.
+* `input` (str, optional): The additional context of the instruction.
+* `output` (str, optional): The model output of the instruction.
+* `target` (str, optional): The target answer for the instruction.
+
+Example:
+
+```json
+{
+ "dev": {
+ "Abstract Algebra": [
+ {
+ "dataset": "mmlu",
+ "split": "dev",
+ "category": "Abstract Algebra",
+ "instruction": "The following is a single-choice question on Abstract Algebra. Answer the question by replying A, B, C or D.",
+ "input": "Question: Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.\nA. 0\nB. 1\nC. 2\nD. 3\nAnswer: ",
+ "output": "",
+ "target": "B"
+ },
+ ]
+ },
+ "test": {
+ "Abstract Algebra": [
+ {
+ "dataset": "mmlu",
+ "split": "test",
+ "category": "Abstract Algebra",
+ "instruction": "The following is a single-choice question on Abstract Algebra. Answer the question by replying A, B, C or D.",
+ "input": "Question: Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.\nA. 0\nB. 4\nC. 2\nD. 6\nAnswer: ",
+ "output": "",
+ "target": "B"
+ },
+ ]
+ }
+}
+```
+
+#### Configuration
+In this step, you will configure your tokenizer and model arguments to infer on the given datasets.
+
+A config file consists of two parts.
+1. Model config. In model config, you need to specify model name, model path, model class, tokenizer arguments and model arguments. For model class, currently we support `HuggingFaceModel`, `HuggingFaceCausalLM`, `ChatGLMModel` and `ChatGLMModel2`. `HuggingFaceModel` is for models that can be loaded with `AutoModel` and `HuggingFaceCausalLM` is for models that can be loaded with `AutoModelForCausalLM`. `ChatGLMModel` and `ChatGLMModel2` are for ChatGLM and ChatGLM2 models respectively. You can check all model classes in `colossal_eval/models/__init__.py`. If your model should set `trust_remote_code` as true, specify it in the `tokenizer_kwargs` and `model_kwargs` fields.
+2. Dataset config. In dataset config, you need to specify dataset name, path and dataset class. Currently, we support zero-shot on dataset MMLU, CMMLU, AGIEval, GAOKAO-Bench and LongBench and few-shot on dataset MMLU, CMMLU and AGIEval. If you want to enable few shot, set `few_shot` as true. You can check all model classes in `colossal_eval/dataset/__init__.py`.
+
+Once you have all config ready, the program will run inference on all the given datasets on all the given models.
+
+An example config using model class `HuggingFaceCausalLM` and dataset class `CMMLUDataset` can be:
+```json
+{
+ "model": [
+ {
+ "name": "model name",
+ "model_class": "HuggingFaceCausalLM",
+ "parameters": {
+ "path": "path to model",
+ "model_max_length": 2048,
+ "tokenizer_path": "path to tokenizer",
+ "tokenizer_kwargs": {
+ "use_fast": false,
+ "trust_remote_code": true
+ },
+ "peft_path": null,
+ "model_kwargs": {
+ "trust_remote_code": true
+ },
+ "prompt_template": "plain",
+ "batch_size": 4
+ }
+ }
+ ],
+ "dataset": [
+ {
+ "name": "dataset name",
+ "dataset_class": "CMMLUDataset",
+ "debug": false,
+ "few_shot": true,
+ "path": "path to original dataset",
+ "save_path": "path to save converted dataset"
+ }
+ ]
+}
+```
+
+Currently, we support Hugging Face models. The `tokenizer_kwargs` is the arguments used in `AutoTokenizer.from_pretrained()`. The `model_kwargs` is the arguments used in `AutoModel.from_pretrained` or `AutoModelForCausalLM.from_pretrained()`. `few_shot` will be set true if you want to enable few-shot prompting for the dataset. `debug` will be set true if you want to verify whether your prompt is right or wrong.
+
+#### How to Use
+An example script can be the following. The `configs/dataset_evaluation/inference.py` is the same in all examples provided.
+
+```shell
+torchrun --nproc_per_node=1 inference.py \
+ --config "path to config file" \
+ --load_dataset \
+ --inference_save_path "path to save inference results"
+```
+
+You should specify the path to config file in `config`. You can run the script without specifying `load_dataset` if you already save the converted dataset or otherwise set it to first load the original dataset and save the converted dataset. You should specify the path to save inference results in `inference_save_path`.
+
+### Evaluation
+
+In the evaluation process, you only need to configure your evaluation parameters. You can use either public dataset or help from GPTs to do evaluation. We will introduce configuration for dataset evaluation and GPT evaluation.
+
+#### Dataset Evaluation
+
+In dataset evaluation, we calculate different metrics on the given inference results and public dataset.
+
+##### Configuration
+
+A config file for dataset evaluation consists of two parts.
+1. Model config. In model config, you need to specify model name. If you want to evaluate perplexity over a pretrain dataset and calculate per-byte-perplexity, you have to add your tokenizer config and model max length.
+2. Dataset config. In dataset config, you need to specify the evaluation metrics for the dataset.
+
+Once you have all config ready, the program will run evaluation on inference results for all given models and dataset.
+
+An example config can be:
+```json
+{
+ "model": [
+ {
+ "name": "model name"
+ }
+ ],
+ "dataset": [
+ {
+ "name": "dataset name",
+ "metrics": ["first_token_accuracy"]
+ }
+ ]
+}
+```
+
+The above config specifies that the program will evaluate the inference results using `first_token_accuracy` metric.
+
+##### How to Use
+
+An example script can be the following.
+
+```shell
+python eval_dataset.py \
+ --config "path to config file" \
+ --inference_results_path "path to inference results" \
+ --evaluation_results_save_path "path to save evaluation results"
+```
+
+You should specify the path to config file in `config`, the path to inference results in `inference_results_path` and the path to save evaluation results in `evaluation_save_path`.
+
+#### GPT Evaluation
+
+In GPT evaluation, we provide a prompt template which can fit in different pre-defined metrics with Chain-of-Thoughts. In the following sections, we will only introduce how you can evaluate model answers using GPTs. More details can be found in `colossal_eval/evaluate/GPT Evaluation.md`.
+
+##### Configuration
+
+The following is an example of a English config file. The configuration file can control how the pipeline evaluates the model. You need to specify GPT evaluation metrics. You can find an example English config file in `configs/gpt_evaluation`.
+
+```json
+{
+ "language": "en",
+ "category": {
+ "brainstorming": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "creativity",
+ "practicality",
+ "reasonableness"
+ ]
+ },
+ }
+}
+```
+
+##### How to Use
+After setting the config file, you can evaluate the model using `examples/gpt_evaluation/eval.py`. If you want to make comparisons between answers of two different models, you should specify two answer files in the argument `answer_file_list` and two model names in the argument `model_name_list`(details can be found in `colossal_eval/evaluate/GPT Evaluation.md`). If you want to evaluate one answer file, the length of both `answer_file_list` and `model_name_list` should be 1 and the program will perform evaluation using GPTs. The prompt files for battle and gpt evaluation can be found in `configs/gpt_evaluation/prompt`. `target file` is the path to the converted dataset you save during inference time.
+
+An example script is provided as follows:
+
+```shell
+python eval.py \
+ --config_file "path to the config file" \
+ --battle_prompt_file "path to the prompt file for battle" \
+ --gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
+ --target_file "path to the target answer file" \
+ --answer_file_list "path to the answer file" \
+ --model_name_list "the names of the model" \
+ --gpt_model "which GPT model to use for evaluation" \
+ --save_path "path to save results" \
+ --openai_key "your openai key" \
+```
+
+## More Details
+
+### Inference
+
+In the inference process, we will do generation, calculate loss over target tokens, calculate number of target tokens, softmax over given options (for example, "A", "B", "C", and "D") according to the inference arguments.
+
+For tokenization, we adopt tokenization strategy in [LongBench](https://github.com/THUDM/LongBench/blob/main/pred.py#L55) to preserve crucial instructions on the left and right side and keep all target tokens.
+
+For labeling target tokens, we adopt method from [FastChat](https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py#L137), but it doesn't always hold true due to tokenizers' different behavior. We plan to insert special tokens to correctly label the target tokens.
+
+For calculating loss, we return per-sample-loss instead of per-batch-loss if we directly use `model(batch).loss` provided in HuggingFace.
+
+### Evaluation
+
+To make it more easier to set the config, you only need to specify all metrics you want to use in key `metrics`. However, the program will only use a subset of metrics you give for different subcategories. Applying all metrics to all subcategories is obviously unsuitable. The suggested metrics for specific categories should be defined in `colossal_eval/evaluate/dataset_evaluator/metrics.py`.
+
+#### Metrics
+
+- `combined_single_choice_accuracy`: A combination of `first_token_logit` and `single_choice_accuracy`. If one of these is correct, the model will get the score. It can be used in all dataset that contains single-choice questions.
+- `first_token_logit`: Calculate score based on softmax score over the given choices. If the argmax of the softmax is equal to the reference, the model will get the score. If there is `NaN` in softmax score, it will calculate the score using exact match. It can be used in all dataset that contains single-choice questions.
+- `single_choice_accuracy`: Calculate score using exact match. It will only get the first uppercase letter such as A, B, C or D that is not surrouded by lowercase letters. If the uppercase letter is equal to the reference, the model will get the score. It can be used in all dataset that contains single-choice questions.
+- `multi_choice_accuracy`: Calculate score on multi-choice questions. It will get a set of all uppercase letters such as A, B, C or D that is not surrouded by lowercase letters. If the prediction conatains uppercase letters that are not in reference. The model will get 0 score. If the prediction contains a uppercase letter that is in reference, the model will get a score of `1/len(reference)`. It is used in AGIEval and GAOKAO-Bench.
+- `math_equivalence`: Code from [hendrycks](https://github.com/hendrycks/math/blob/main/modeling/math_equivalence.py). Compute scores over the prediction math formula and reference math formula. It is used in AGIEval and GAOKAO-Bench.
+- `f1_score`: Calculate English f1 score between prediction and reference. It is used in Longbench.
+- `f1_zh_score`: Calculate Chinese f1 score between prediction and reference. It is used in Longbench.
+- `rouge_score`: Calculate English f1 score between prediction and reference. It is used in GAOKAO-Bench and LongBench.
+- `rouge_zh_score`: Calculate Chinese rouge score between prediction and reference. It is used in GAOKAO-Bench and LongBench.
+- `retrieval_score`: Calculate English retrieval score between prediction and reference. It determines whether the ouput(which paragraph) corresponds to the given abstract. It is used in Longbench.
+- `retrieval_zh_score`: Calculate Chinese retrieval score between prediction and reference. It determines whether the ouput(which paragraph) corresponds to the given abstract. It is used in Longbench.
+- `classification_score`: Calculate classification score between prediction and reference. It determines whether the ouput(a class) is equal to the reference. It is used in Longbench.
+- `code_sim_score`: Calculate similarity score between prediction and reference. It is used in Longbench.
+- `count_score`: Calculate count score between prediction and reference. It determines whether the ouput(number of given passages) is equal to the reference. It is used in Longbench.
+- `perplexity`: Calculate perplexity. The formula is $ perplexity = \frac{1}{n} \sum_i e^{loss_i} $ where $n$ is the number of samples and $ loss_i $ is the average loss for sample $ i $. It can be used in all dataset.
+- `ppl_score`: Calculate perplexity score. The formula is $ ppl\_score = \frac{1}{n} \sum_i e^{-loss_i} $ where $n$ is the number of samples and $ loss_i $ is the average loss for sample $ i $. It can be used in all dataset.
+- `ppl_score_over_choices`: Calculate perplexity score over choices. The formula is $ ppl\_score\_over\_choices= \frac{1}{n} \sum_i e^{-loss\_over\_choices_i} $ where $n$ is the number of samples and $ loss\_over\_choices_i $ is the loss on the first predicted token for sample $ i $. It can be used in all dataset that contains single-choice questions.
+- `per_byte_perplexity`: Calculate per byte perplexity. The formula is $ \frac{1}{n} \sum_i e^{\frac{loss_i}{byte_i}} $ where $n$ is the number of samples, $ loss_i $ is the total loss for sample $ i $ and $ byte_i $ is the number of bytes sample $ i $ occupies. It can be used in all dataset.
+- `per_byte_ppl_score`: Calculate per byte perplexity score. The formula is $ \frac{1}{n} \sum_i e^{-\frac{loss_i}{byte_i}} $ where $n$ is the number of samples, $ loss_i $ is the total loss for sample $ i $ and $ byte_i $ is the number of bytes sample $ i $ occupies. It can be used in all dataset.
+
+We use `combined_single_choice_accuracy` and `first_token_logit` in the leaderboard.
+
+### Examples
+
+We provide 2 examples for you to explore our `colossal_eval` package.
+
+#### Dataset Evaluation Example
+
+This example is in folder `examples/dataset_evaluation`.
+
+1. `cd examples/dataset_evaluation`
+2. Fill in your inference config file in `config/inference/config.json`. Set the model and dataset parameters.
+3. Run `inference.sh` to get inference results.
+4. Fill in your evaluation config file in `config/evaluation/config.json`. Set the model and dataset parameters.
+5. Run `eval_dataset.sh` to get evaluation results.
+
+#### GPT Evaluation Example
+
+The examples is in folder `examples/gpt_evaluation`.
+
+1. `cd examples/gpt_evaluation`
+2. Fill in your inference config file in `config/inference/config.json`. Set the model and dataset parameters. If you want to use the example dataset we provide, the dataset is `ColossalDataset`.
+3. Run `inference.sh` to get inference results.
+4. Fill in your evaluation config file in `config/evaluation/config.json`.
+5. Run `eval.sh` to get evaluation results.
+
+## FAQ
+
+### How to Add a New Metric?
+
+If you want to add a customized metric, we recommend using `pip install -e .` to ensure that any changes you make to the source code will immediately affect the package you install.
+
+To add a new metric, you can follow the example of multi_choice_accuracy in line 339 in `colossal_eval/evaluate/dataset_evaluator/metric.py`. The method take one data sample's prediction and reference as input and return a score ranging from 0 to 1.
+
+A skeleton of code is the following.
+
+```python
+
+def CustomizedMetric(prediction: str, reference: str):
+ score = xxx
+ return score
+```
+
+Once you have successfully added your own metric, you should specify your metric both in `colossal_eval/evaluate/dataset_evaluator/metric.py` (suggest which subcategories shoule the metric be applied to) and your evaluation config.
+
+### How to Add a New Dataset?
+
+If you want to add customized dataset, we recommend using `pip install -e .` to ensure that any changes you make to the source code will immediately affect the package you install.
+
+To add a new dataset, you can follow the example of `colossal_eval/dataset/mmlu.py`. You need to make sure that the format of questions in one subcategory should be the same. For example, all questions should have target answers or all questions should be single-choice questions.
+
+A skeleton of code is the following.
+
+```python
+
+class CustomizedDataset(BaseDataset):
+ @staticmethod
+ def load():
+ # 1. Load and convert the original dataset format.
+ # 2. Assign inference arguments for each subcategory.
+ # 3. Return the converted dataset.
+ pass
+```
+
+Once you have successfully added your own dataset, you can specify your dataset class in your inference config.
+
+### How to Add a New Model?
+
+If you want to add customized models, we recommend using `pip install -e .` to ensure that any changes you make to the source code will immediately affect the package you install.
+
+To add a new model, you can follow the example of `colossal_eval/models/huggingface.py`. You need to provide a way to load the model and tokenizer, calculate loss and generate.
+
+A skeleton of code is the following.
+
+```python
+
+class CustomizedModel(BaseModel):
+ def __init__(self):
+ super().__init__()
+ self._load_tokenizer()
+ self._load_model()
+
+ def _load_tokenizer():
+ pass
+
+ def _load_model():
+ pass
+
+ def _calculate_loss():
+ pass
+
+ def get_loss():
+ self._calculate_loss()
+
+ def inference(samples):
+ # 1. Load samples from the same subcategory.
+ # 2. Infer in a batch way according to inference arguments.
+ # 3. Return results.
+ batch_samples = xxx
+ self.get_loss(batch_samples)
+ self.generate(batch_samples)
+
+ return inference_results
+
+ def generate():
+ pass
+```
+
+Once you have successfully added your own model, you can specify your model class in your inference config.
+
+## To do
+
+- [ ] Add visualization code for evaluation results on public dataset
+- [ ] Improve the way to label target tokens
+
+## Citations
+
+```bibtex
+@misc{zhong2023agieval,
+ title={AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models},
+ author={Wanjun Zhong and Ruixiang Cui and Yiduo Guo and Yaobo Liang and Shuai Lu and Yanlin Wang and Amin Saied and Weizhu Chen and Nan Duan},
+ year={2023},
+ eprint={2304.06364},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+
+@article{huang2023ceval,
+title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
+author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
+journal={arXiv preprint arXiv:2305.08322},
+year={2023}
+}
+
+@misc{li2023cmmlu,
+ title={CMMLU: Measuring massive multitask language understanding in Chinese},
+ author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin},
+ year={2023},
+ eprint={2306.09212},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+
+@inproceedings{Zhang2023EvaluatingTP,
+ title={Evaluating the Performance of Large Language Models on GAOKAO Benchmark},
+ author={Xiaotian Zhang and Chunyang Li and Yi Zong and Zhengyu Ying and Liang He and Xipeng Qiu},
+ year={2023}
+}
+
+@misc{bai2023longbench,
+ title={LongBench: A Bilingual, Multitask Benchmark for Long Context Understanding},
+ author={Yushi Bai and Xin Lv and Jiajie Zhang and Hongchang Lyu and Jiankai Tang and Zhidian Huang and Zhengxiao Du and Xiao Liu and Aohan Zeng and Lei Hou and Yuxiao Dong and Jie Tang and Juanzi Li},
+ year={2023},
+ eprint={2308.14508},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+
+@article{hendryckstest2021,
+ title={Measuring Massive Multitask Language Understanding},
+ author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+ journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+ year={2021}
+}
+
+@article{hendrycks2021ethics,
+ title={Aligning AI With Shared Human Values},
+ author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
+ journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+ year={2021}
+}
+
+@misc{zheng2023judging,
+ title={Judging LLM-as-a-judge with MT-Bench and Chatbot Arena},
+ author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric. P Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
+ year={2023},
+ eprint={2306.05685},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+
+```
diff --git a/applications/ColossalEval/colossal_eval/__init__.py b/applications/ColossalEval/colossal_eval/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/applications/ColossalEval/colossal_eval/dataset/__init__.py b/applications/ColossalEval/colossal_eval/dataset/__init__.py
new file mode 100644
index 000000000000..5b029e2673b1
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/dataset/__init__.py
@@ -0,0 +1,21 @@
+from .agieval import AGIEvalDataset
+from .base import BaseDataset
+from .ceval import CEvalDataset
+from .cmmlu import CMMLUDataset
+from .colossalai import ColossalDataset
+from .gaokaobench import GaoKaoBenchDataset
+from .longbench import LongBenchDataset
+from .mmlu import MMLUDataset
+from .mtbench import MTBenchDataset
+
+__all__ = [
+ "AGIEvalDataset",
+ "BaseDataset",
+ "CEvalDataset",
+ "CMMLUDataset",
+ "GaoKaoBenchDataset",
+ "LongBenchDataset",
+ "MMLUDataset",
+ "ColossalDataset",
+ "MTBenchDataset",
+]
diff --git a/applications/ColossalEval/colossal_eval/dataset/agieval.py b/applications/ColossalEval/colossal_eval/dataset/agieval.py
new file mode 100644
index 000000000000..92ebd65931ed
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/dataset/agieval.py
@@ -0,0 +1,247 @@
+# Adapted from https://github.com/ruixiangcui/AGIEval/blob/main/src/dataset_loader.py.
+
+import ast
+import glob
+import os
+from copy import deepcopy
+from typing import Dict, List
+
+import pandas as pd
+from colossal_eval.utils import get_json_list
+
+from colossalai.logging import DistributedLogger
+
+from .base import BaseDataset
+
+# define the datasets
+english_qa_datasets = [
+ "lsat-ar",
+ "lsat-lr",
+ "lsat-rc",
+ "logiqa-en",
+ "sat-math",
+ "sat-en",
+ "aqua-rat",
+ "sat-en-without-passage",
+ "gaokao-english",
+]
+chinese_qa_datasets = [
+ "logiqa-zh",
+ "jec-qa-kd",
+ "jec-qa-ca",
+ "gaokao-chinese",
+ "gaokao-geography",
+ "gaokao-history",
+ "gaokao-biology",
+ "gaokao-chemistry",
+ "gaokao-physics",
+ "gaokao-mathqa",
+]
+english_cloze_datasets = ["math"]
+chinese_cloze_datasets = ["gaokao-mathcloze"]
+
+multi_choice_datasets = ["jec-qa-kd", "jec-qa-ca", "gaokao-physics", "gaokao-mathqa"]
+math_output_datasets = {"gaokao-mathcloze", "math"}
+
+default_inference_kwargs = {
+ "calculate_loss": True,
+ "all_classes": None,
+ "language": "Chinese",
+ "pretrain": False,
+ "max_new_tokens": 32,
+}
+
+
+def get_prompt(line: Dict, dataset_name: str, logger: DistributedLogger) -> Dict:
+ """Modified from https://github.com/microsoft/AGIEval/blob/main/src/dataset_loader.py#L190"""
+ try:
+ all_classes = None
+ passage = line["passage"] if line["passage"] is not None else ""
+
+ if dataset_name in english_qa_datasets:
+ option_string = "ABCDEFG"
+ count = len(line["options"])
+
+ input = (
+ "Question: "
+ + line["question"]
+ + " "
+ + "Choose from the following options: "
+ + " ".join(line["options"])
+ + "\n"
+ + "Answer: "
+ )
+
+ all_classes = list(option_string[0:count])
+
+ elif dataset_name in chinese_qa_datasets:
+ option_string = "ABCDEFG"
+ count = len(line["options"])
+
+ input = "问题:" + line["question"] + " " + "从以下选项中选择:" + " ".join(line["options"]) + "\n" + "答案:"
+
+ all_classes = list(option_string[0:count])
+
+ elif dataset_name in english_cloze_datasets:
+ input = "Question: " + line["question"] + "\n" + "Answer: "
+
+ elif dataset_name in chinese_cloze_datasets:
+ input = "问题:" + line["question"] + "\n" + "答案:"
+
+ return {
+ "instruction": input if not passage else passage + "\n\n" + input,
+ "target": line["label"] if line["label"] else line["answer"],
+ }, all_classes
+
+ except NameError:
+ logger.info("Dataset not defined.")
+
+
+# process few-shot raw_prompts
+def combine_prompt(prompt_path, dataset_name, load_explanation=True, chat_mode=False):
+ skip_passage = False
+ if dataset_name == "sat-en-without-passage":
+ skip_passage = True
+ dataset_name = "sat-en"
+ demostrations = []
+ # read the prompts by context and explanation
+ context_row = [0, 1, 3, 5, 7, 9]
+ explanation_row = [0, 2, 4, 6, 8, 10]
+ raw_prompts_context = pd.read_csv(
+ prompt_path, header=0, skiprows=lambda x: x not in context_row, keep_default_na=False
+ )
+ raw_prompts_explanation = pd.read_csv(
+ prompt_path, header=0, skiprows=lambda x: x not in explanation_row, keep_default_na=False
+ ).replace(r"\n\n", "\n", regex=True)
+ contexts = []
+ for line in list(raw_prompts_context[dataset_name]):
+ if line:
+ # print(line)
+ contexts.append(ast.literal_eval(line))
+ explanations = [exp for exp in raw_prompts_explanation[dataset_name] if exp]
+
+ for idx, (con, exp) in enumerate(zip(contexts, explanations)):
+ passage = con["passage"] if con["passage"] is not None and not skip_passage else ""
+ question = con["question"]
+ options = con["options"] if con["options"] is not None else ""
+ label = con["label"] if con["label"] is not None else ""
+ answer = con["answer"] if "answer" in con and con["answer"] is not None else ""
+
+ if dataset_name in english_qa_datasets:
+ question_input = (
+ "Question: "
+ + passage
+ + " "
+ + question
+ + "\n"
+ + "Choose from the following options: "
+ + " ".join(options)
+ + "\n"
+ + "Answer: {}".format(label)
+ )
+ elif dataset_name in chinese_qa_datasets:
+ question_input = (
+ "问题:" + passage + " " + question + "\n" + "从以下选项中选择:" + " ".join(options) + "\n" + "答案:{}".format(label)
+ )
+ elif dataset_name in english_cloze_datasets:
+ question_input = "Question: ".format(idx + 1) + question + "\n" + "Answer: {}".format(answer)
+ elif dataset_name in chinese_cloze_datasets:
+ question_input = "问题:" + question + "\n" + "答案:{}".format(answer)
+ else:
+ raise ValueError(f"During loading few-sot examples, found unknown dataset: {dataset_name}")
+
+ if chat_mode:
+ demostrations.append((question_input,))
+ else:
+ demostrations.append(question_input + "\n")
+
+ return demostrations
+
+
+class AGIEvalDataset(BaseDataset):
+ """
+ Dataset wrapper for AGIEval dataset.
+ Data source: https://github.com/microsoft/AGIEval
+ This dataset class will convert the original dataset into the inference dataset.
+
+ A few dirty data needed to be manually corrected in the origin dataset:
+ Issue link: https://github.com/microsoft/AGIEval/issues/16
+ 1. Invalid options in line 190 in gaokao-chemistry.jsonl.
+ 2. Option D (They may increase in value as those same resources become rare on Earth.) missing in line 17 in sat-en-without-passage.jsonl.
+ 3. Option D (They may increase in value as those same resources become rare on Earth.) missing in line 17 in sat-en.jsonl.
+ 4. Option D (No, because the data do not indicate whether the honeybees had been infected with mites.) missing in line 57 in sat-en-without-passage.jsonl.
+ 5. Option D (No, because the data do not indicate whether the honeybees had been infected with mites.) missing in line 57 in sat-en.jsonl.
+ 6. Option D (Published theories of scientists who developed earlier models of the Venus flytrap) missing in line 98 in sat-en-without-passage.jsonl.
+ 7. Option D (Published theories of scientists who developed earlier models of the Venus flytrap) missing in line 98 in sat-en.jsonl.
+ 8. Label is empty in line 212 in jec-qa-kd.jsonl. Content is also dirty.
+ 9. Actually, gaokao-mathqa.jsonl is also a multi-choice dataset. See line 149 286 287.
+ """
+
+ @staticmethod
+ def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
+ dataset = {"test": {}}
+
+ files = glob.glob(os.path.join(path, "*.jsonl"))
+ files.sort()
+
+ if few_shot:
+ prompt_path = os.path.join(path, "few_shot_prompts.csv")
+
+ for file in files:
+ dataset_name = os.path.basename(file)[0 : -len(".jsonl")]
+
+ few_shot_data = []
+ if few_shot:
+ # process demo once if it is few-shot-CoT
+ few_shot_data = combine_prompt(prompt_path, dataset_name, load_explanation=False, chat_mode=False)
+
+ dataset["test"][dataset_name] = {"data": []}
+
+ file_dir = os.path.join(path, file)
+
+ loaded_jsonl = get_json_list(file_dir)
+
+ # It's been tested that each data sample in one subcategory have same inference arguments.
+ _, all_classes = get_prompt(loaded_jsonl[0], dataset_name, logger)
+ inference_kwargs = deepcopy(default_inference_kwargs)
+ if all_classes is not None and dataset_name not in multi_choice_datasets:
+ inference_kwargs["all_classes"] = all_classes
+
+ if dataset_name in english_qa_datasets:
+ inference_kwargs["language"] = "English"
+ if dataset_name in chinese_qa_datasets:
+ inference_kwargs["language"] = "Chinese"
+ inference_kwargs["few_shot_data"] = few_shot_data
+
+ dataset["test"][dataset_name]["inference_kwargs"] = inference_kwargs
+
+ for line in loaded_jsonl:
+ info, all_classes = get_prompt(line, dataset_name, logger)
+
+ # Convert multi-choice answers to a single string.
+ # We will convert it back when evaluating.
+ # We do this because if target is a list, it should be only used for multiple target answers.
+ if dataset_name in multi_choice_datasets:
+ if isinstance(info["target"], str) and len(info["target"]) > 1:
+ # "gaokao-mathqa" actually contain multi-choice questions.
+ # This if clause is specially used for it.
+ info["target"] = "".join(info["target"].split())
+ else:
+ info["target"] = "".join(info["target"])
+
+ if isinstance(info["target"], list) and len(info["target"]) == 1:
+ info["target"] = info["target"][0]
+
+ data_sample = {
+ "dataset": "agieval",
+ "split": "test",
+ "category": dataset_name,
+ "instruction": info["instruction"],
+ "input": "",
+ "output": "",
+ "target": info["target"],
+ }
+
+ dataset["test"][dataset_name]["data"].append(data_sample)
+
+ return dataset
diff --git a/applications/ColossalEval/colossal_eval/dataset/base.py b/applications/ColossalEval/colossal_eval/dataset/base.py
new file mode 100644
index 000000000000..45b0151b849f
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/dataset/base.py
@@ -0,0 +1,24 @@
+from abc import abstractstaticmethod
+
+from colossal_eval.utils import jdump
+
+
+class BaseDataset:
+ """
+ Base class for dataset wrapper.
+
+ Args:
+ path: The path to the original dataset.
+ logger: Logger for the dataset.
+ """
+
+ def __init__(self, path, logger, few_shot):
+ self.dataset = self.load(path, logger, few_shot)
+
+ def save(self, save_path):
+ """Save the converted dataset"""
+ jdump(self.dataset, save_path)
+
+ @abstractstaticmethod
+ def load(path, logger):
+ """Load the original dataset and convert it into the inference dataset"""
diff --git a/applications/ColossalEval/colossal_eval/dataset/ceval.py b/applications/ColossalEval/colossal_eval/dataset/ceval.py
new file mode 100644
index 000000000000..32ec52087bd3
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/dataset/ceval.py
@@ -0,0 +1,132 @@
+import copy
+import csv
+import os
+from typing import Dict, List
+
+from colossalai.logging import DistributedLogger
+
+from .base import BaseDataset
+
+ceval_subject_mapping = {
+ "computer_network": ["Computer Network", "计算机网络", "STEM"],
+ "operating_system": ["Operating System", "操作系统", "STEM"],
+ "computer_architecture": ["Computer Architecture", "计算机组成", "STEM"],
+ "college_programming": ["College Programming", "大学编程", "STEM"],
+ "college_physics": ["College Physics", "大学物理", "STEM"],
+ "college_chemistry": ["College Chemistry", "大学化学", "STEM"],
+ "advanced_mathematics": ["Advanced Mathematics", "高等数学", "STEM"],
+ "probability_and_statistics": ["Probability and Statistics", "概率统计", "STEM"],
+ "discrete_mathematics": ["Discrete Mathematics", "离散数学", "STEM"],
+ "electrical_engineer": ["Electrical Engineer", "注册电气工程师", "STEM"],
+ "metrology_engineer": ["Metrology Engineer", "注册计量师", "STEM"],
+ "high_school_mathematics": ["High School Mathematics", "高中数学", "STEM"],
+ "high_school_physics": ["High School Physics", "高中物理", "STEM"],
+ "high_school_chemistry": ["High School Chemistry", "高中化学", "STEM"],
+ "high_school_biology": ["High School Biology", "高中生物", "STEM"],
+ "middle_school_mathematics": ["Middle School Mathematics", "初中数学", "STEM"],
+ "middle_school_biology": ["Middle School Biology", "初中生物", "STEM"],
+ "middle_school_physics": ["Middle School Physics", "初中物理", "STEM"],
+ "middle_school_chemistry": ["Middle School Chemistry", "初中化学", "STEM"],
+ "veterinary_medicine": ["Veterinary Medicine", "兽医学", "STEM"],
+ "college_economics": ["College Economics", "大学经济学", "Social Science"],
+ "business_administration": ["Business Administration", "工商管理", "Social Science"],
+ "marxism": ["Marxism", "马克思主义基本原理", "Social Science"],
+ "mao_zedong_thought": ["Mao Zedong Thought", "毛泽东思想和中国特色社会主义理论体系概论", "Social Science"],
+ "education_science": ["Education Science", "教育学", "Social Science"],
+ "teacher_qualification": ["Teacher Qualification", "教师资格", "Social Science"],
+ "high_school_politics": ["High School Politics", "高中政治", "Social Science"],
+ "high_school_geography": ["High School Geography", "高中地理", "Social Science"],
+ "middle_school_politics": ["Middle School Politics", "初中政治", "Social Science"],
+ "middle_school_geography": ["Middle School Geography", "初中地理", "Social Science"],
+ "modern_chinese_history": ["Modern Chinese History", "近代史纲要", "Humanities"],
+ "ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "思想道德修养与法律基础", "Humanities"],
+ "logic": ["Logic", "逻辑学", "Humanities"],
+ "law": ["Law", "法学", "Humanities"],
+ "chinese_language_and_literature": ["Chinese Language and Literature", "中国语言文学", "Humanities"],
+ "art_studies": ["Art Studies", "艺术学", "Humanities"],
+ "professional_tour_guide": ["Professional Tour Guide", "导游资格", "Humanities"],
+ "legal_professional": ["Legal Professional", "法律职业资格", "Humanities"],
+ "high_school_chinese": ["High School Chinese", "高中语文", "Humanities"],
+ "high_school_history": ["High School History", "高中历史", "Humanities"],
+ "middle_school_history": ["Middle School History", "初中历史", "Humanities"],
+ "civil_servant": ["Civil Servant", "公务员", "Other"],
+ "sports_science": ["Sports Science", "体育学", "Other"],
+ "plant_protection": ["Plant Protection", "植物保护", "Other"],
+ "basic_medicine": ["Basic Medicine", "基础医学", "Other"],
+ "clinical_medicine": ["Clinical Medicine", "临床医学", "Other"],
+ "urban_and_rural_planner": ["Urban and Rural Planner", "注册城乡规划师", "Other"],
+ "accountant": ["Accountant", "注册会计师", "Other"],
+ "fire_engineer": ["Fire Engineer", "注册消防工程师", "Other"],
+ "environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "环境影响评价工程师", "Other"],
+ "tax_accountant": ["Tax Accountant", "税务师", "Other"],
+ "physician": ["Physician", "医师资格", "Other"],
+}
+
+default_inference_kwargs = {
+ "calculate_loss": False,
+ "all_classes": ["A", "B", "C", "D"],
+ "language": "Chinese",
+ "pretrain": False,
+ "max_new_tokens": 32,
+}
+
+
+def get_few_shot_data(data: List[Dict]):
+ few_shot_data = []
+ for i in data:
+ few_shot_data.append(i["input"] + i["target"])
+ return few_shot_data
+
+
+class CEvalDataset(BaseDataset):
+ """
+ Dataset class for CEval dataset.
+ Data source: https://huggingface.co/datasets/ceval/ceval-exam
+ This dataset class will convert the original dataset into the inference dataset.
+ """
+
+ @staticmethod
+ def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
+ dataset = {"dev": {}, "test": {}}
+ for split in ["dev", "test"]:
+ files = os.listdir(os.path.join(path, split))
+ files.sort()
+
+ for file in files:
+ subject = file[0 : -len(f"_{split}.csv")]
+ subject = ceval_subject_mapping[subject][1]
+
+ file_dir = os.path.join(path, split, file)
+
+ dataset[split][subject] = {"data": []}
+
+ # It's been tested that each data sample in one subcategory have same inference arguments.
+ dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)
+
+ if split == "test" and few_shot:
+ dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
+ dataset["dev"][subject]["data"]
+ )
+
+ with open(file_dir, encoding="utf-8") as f:
+ reader = csv.reader(f)
+ _ = next(reader)
+ for row in reader:
+ # Dev split have answer and explanation so len(row) is 8
+ # But test split doesn't contain answer and explanation, so len(row) is 6
+ assert len(row) >= 6
+ choices = f"A. {row[2]}\nB. {row[3]}\nC. {row[4]}\nD. {row[5]}"
+ data_sample = {
+ "dataset": "ceval",
+ "split": split,
+ "category": subject,
+ "instruction": f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。",
+ "input": f"题目:{row[1]}\n{choices}\n答案:",
+ "output": "",
+ "target": row[6] if split == "dev" else "",
+ "id": int(row[0]),
+ }
+
+ dataset[split][subject]["data"].append(data_sample)
+
+ return dataset
diff --git a/applications/ColossalEval/colossal_eval/dataset/cmmlu.py b/applications/ColossalEval/colossal_eval/dataset/cmmlu.py
new file mode 100644
index 000000000000..51f8ca14e0c8
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/dataset/cmmlu.py
@@ -0,0 +1,144 @@
+import copy
+import csv
+import os
+from typing import Dict, List
+
+from colossalai.logging import DistributedLogger
+
+from .base import BaseDataset
+
+cmmlu_subject_mapping = {
+ "agronomy": "农学",
+ "anatomy": "解剖学",
+ "ancient_chinese": "古汉语",
+ "arts": "艺术学",
+ "astronomy": "天文学",
+ "business_ethics": "商业伦理",
+ "chinese_civil_service_exam": "中国公务员考试",
+ "chinese_driving_rule": "中国驾驶规则",
+ "chinese_food_culture": "中国饮食文化",
+ "chinese_foreign_policy": "中国外交政策",
+ "chinese_history": "中国历史",
+ "chinese_literature": "中国文学",
+ "chinese_teacher_qualification": "中国教师资格",
+ "clinical_knowledge": "临床知识",
+ "college_actuarial_science": "大学精算学",
+ "college_education": "大学教育学",
+ "college_engineering_hydrology": "大学工程水文学",
+ "college_law": "大学法律",
+ "college_mathematics": "大学数学",
+ "college_medical_statistics": "大学医学统计",
+ "college_medicine": "大学医学",
+ "computer_science": "计算机科学",
+ "computer_security": "计算机安全",
+ "conceptual_physics": "概念物理学",
+ "construction_project_management": "建设工程管理",
+ "economics": "经济学",
+ "education": "教育学",
+ "electrical_engineering": "电气工程",
+ "elementary_chinese": "小学语文",
+ "elementary_commonsense": "小学常识",
+ "elementary_information_and_technology": "小学信息技术",
+ "elementary_mathematics": "初等数学",
+ "ethnology": "民族学",
+ "food_science": "食品科学",
+ "genetics": "遗传学",
+ "global_facts": "全球事实",
+ "high_school_biology": "高中生物",
+ "high_school_chemistry": "高中化学",
+ "high_school_geography": "高中地理",
+ "high_school_mathematics": "高中数学",
+ "high_school_physics": "高中物理学",
+ "high_school_politics": "高中政治",
+ "human_sexuality": "人类性行为",
+ "international_law": "国际法学",
+ "journalism": "新闻学",
+ "jurisprudence": "法理学",
+ "legal_and_moral_basis": "法律与道德基础",
+ "logical": "逻辑学",
+ "machine_learning": "机器学习",
+ "management": "管理学",
+ "marketing": "市场营销",
+ "marxist_theory": "马克思主义理论",
+ "modern_chinese": "现代汉语",
+ "nutrition": "营养学",
+ "philosophy": "哲学",
+ "professional_accounting": "专业会计",
+ "professional_law": "专业法学",
+ "professional_medicine": "专业医学",
+ "professional_psychology": "专业心理学",
+ "public_relations": "公共关系",
+ "security_study": "安全研究",
+ "sociology": "社会学",
+ "sports_science": "体育学",
+ "traditional_chinese_medicine": "中医中药",
+ "virology": "病毒学",
+ "world_history": "世界历史",
+ "world_religions": "世界宗教",
+}
+
+default_inference_kwargs = {
+ "calculate_loss": True,
+ "all_classes": ["A", "B", "C", "D"],
+ "language": "Chinese",
+ "pretrain": False,
+ "max_new_tokens": 32,
+}
+
+
+def get_few_shot_data(data: List[Dict]):
+ few_shot_data = []
+ for i in data:
+ few_shot_data.append(i["input"] + i["target"])
+ return few_shot_data
+
+
+class CMMLUDataset(BaseDataset):
+ """
+ Dataset class for CMMLU dataset.
+ Data source: https://github.com/haonan-li/CMMLU/tree/master/data
+ This dataset class will convert the original dataset into the inference dataset.
+ """
+
+ @staticmethod
+ def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
+ dataset = {"dev": {}, "test": {}}
+ for split in ["dev", "test"]:
+ files = os.listdir(os.path.join(path, split))
+ files.sort()
+
+ for file in files:
+ subject = file[0 : -len(".csv")]
+ subject = cmmlu_subject_mapping[subject]
+
+ file_dir = os.path.join(path, split, file)
+
+ dataset[split][subject] = {"data": []}
+
+ # It's been tested that each data sample in one subcategory have same inference arguments.
+ dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)
+
+ if split == "test" and few_shot:
+ dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
+ dataset["dev"][subject]["data"]
+ )
+
+ with open(file_dir, encoding="utf-8") as f:
+ reader = csv.reader(f)
+ _ = next(reader)
+ for row in reader:
+ assert len(row) == 7
+ choices = f"A. {row[2]}\nB. {row[3]}\nC. {row[4]}\nD. {row[5]}"
+ data_sample = {
+ "dataset": "cmmlu",
+ "split": split,
+ "category": subject,
+ "instruction": f"以下是关于{subject}的单项选择题,请直接给出正确答案的选项。",
+ "input": f"题目:{row[1]}\n{choices}\n答案:",
+ "output": "",
+ "target": row[6],
+ }
+
+ dataset[split][subject]["data"].append(data_sample)
+
+ return dataset
diff --git a/applications/ColossalEval/colossal_eval/dataset/colossalai.py b/applications/ColossalEval/colossal_eval/dataset/colossalai.py
new file mode 100644
index 000000000000..54ea478ae5d6
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/dataset/colossalai.py
@@ -0,0 +1,70 @@
+from collections import defaultdict
+from copy import deepcopy
+from typing import Dict, List
+
+from colossal_eval.utils import jload
+
+from colossalai.logging import DistributedLogger
+
+from .base import BaseDataset
+
+default_inference_kwargs = {
+ "calculate_loss": False,
+ "all_classes": None,
+ "language": "Chinese",
+ "pretrain": False,
+ "max_new_tokens": 256,
+}
+
+# You can add your own subcategory questions and specify whether it is a single-choice question or has target answers and need to calculate loss.
+single_choice_question = set()
+calculate_loss = set()
+
+
+def get_data_per_category(data):
+ data_per_category = defaultdict(list)
+ for item in data:
+ category = item["category"]
+ data_per_category[category].append(item)
+
+ return data_per_category
+
+
+class ColossalDataset(BaseDataset):
+ """
+ Dataset class for Colossal dataset.
+ This dataset class will convert the original dataset into the inference dataset.
+ """
+
+ @staticmethod
+ def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
+ dataset = {"test": {}}
+ data = jload(path)
+ data_per_category = get_data_per_category(data)
+ categories = list(data_per_category.keys())
+
+ for category in categories:
+ dataset["test"][category] = {"data": []}
+ category_data = data_per_category[category]
+
+ dataset["test"][category]["inference_kwargs"] = deepcopy(default_inference_kwargs)
+
+ if category in calculate_loss:
+ dataset["test"][category]["inference_kwargs"]["calculate_loss"] = True
+ if category in single_choice_question:
+ dataset["test"][category]["inference_kwargs"]["all_classes"] = ["A", "B", "C", "D"]
+
+ for item in category_data:
+ data_sample = {
+ "dataset": "colossal",
+ "split": "test",
+ "category": category,
+ "instruction": item["instruction"],
+ "input": item["input"],
+ "output": "",
+ "target": item["target"],
+ "id": item["id"],
+ }
+ dataset["test"][category]["data"].append(data_sample)
+
+ return dataset
diff --git a/applications/ColossalEval/colossal_eval/dataset/gaokaobench.py b/applications/ColossalEval/colossal_eval/dataset/gaokaobench.py
new file mode 100644
index 000000000000..7bf0639e4882
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/dataset/gaokaobench.py
@@ -0,0 +1,122 @@
+import json
+import os
+import re
+from copy import deepcopy
+from typing import Dict, List
+
+from colossalai.logging import DistributedLogger
+
+from .base import BaseDataset
+
+multi_choice_datasets = [
+ "Chinese Lang and Usage MCQs",
+ "Chinese Modern Lit",
+ "English Fill in Blanks",
+ "English Reading Comp",
+ "Geography MCQs",
+ "Physics MCQs",
+ "English Cloze Test",
+]
+
+chinese_qa_datasets = [
+ "Biology MCQs",
+ "Chemistry MCQs",
+ "Chinese Lang and Usage MCQs",
+ "Chinese Modern Lit",
+ "Geography MCQs",
+ "History MCQs",
+ "Math I MCQs",
+ "Math II MCQs",
+ "Physics MCQs",
+ "Political Science MCQs",
+]
+english_qa_datasets = ["English MCQs", "English Fill in Blanks", "English Reading Comp", "English Cloze Test"]
+
+default_inference_kwargs = {
+ "calculate_loss": True,
+ "all_classes": None,
+ "language": "Chinese",
+ "pretrain": False,
+ "max_new_tokens": 32,
+}
+
+
+def get_all_classes(instruction: str):
+ letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ pattern = r"([A-Z]\. |[A-Z].|[A-Z]\.)"
+ options = sorted(list(set(re.findall(pattern, instruction))))
+ options = sorted(list(set([string[0] for string in options])))
+
+ for i in range(len(options)):
+ if options[i] == letters[i]:
+ continue
+ else:
+ return options[0:i]
+ return options
+
+
+class GaoKaoBenchDataset(BaseDataset):
+ """
+ Dataset class for GAOKAO-Bench dataset.
+ Data source: https://github.com/OpenLMLab/GAOKAO-Bench/tree/main/data
+ This dataset class will convert the original dataset into the inference dataset.
+
+ A few typos needed to be manually corrected in the origin dataset, some of the following is fixed.
+ Issue link: https://github.com/OpenLMLab/GAOKAO-Bench/issues/20
+ 1. Option C missing in index 111 in 2010-2022_Chemistry_MCQs.json
+ 2. Option B missing "." after it in index 16 in 2012-2022_English_Cloze_Test.json
+ 3. Option G missing "." after it in index 23 in 2012-2022_English_Cloze_Test.json
+ """
+
+ @staticmethod
+ def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
+ dataset = {"test": {}}
+ for category in ["Fill-in-the-blank_Questions", "Multiple-choice_Questions", "Open-ended_Questions"]:
+ files = os.listdir(os.path.join(path, "data", category))
+ files.sort()
+
+ for file in files:
+ subject = file[10:-5].split("_")
+ subject = " ".join(subject)
+ dataset["test"][subject] = {"data": []}
+
+ file_dir = os.path.join(path, "data", category, file)
+
+ with open(file_dir, encoding="utf-8") as f:
+ data = json.load(f)
+
+ # It's been tested that each data sample in one subcategory have same inference arguments.
+ inference_kwargs = deepcopy(default_inference_kwargs)
+ if category == "Multiple-choice_Questions" and subject not in multi_choice_datasets:
+ all_classes = get_all_classes(data["example"][0]["question"])
+ inference_kwargs["all_classes"] = all_classes
+ if subject in english_qa_datasets:
+ inference_kwargs["language"] = "English"
+ if subject in chinese_qa_datasets:
+ inference_kwargs["language"] = "Chinese"
+
+ dataset["test"][subject]["inference_kwargs"] = inference_kwargs
+
+ for sample in data["example"]:
+ # Convert multi-choice answers to a single string.
+ # We will convert it back when evaluating.
+ # We do this because if target is a list, it should be only used for multiple target answers.
+ if subject in multi_choice_datasets:
+ sample["answer"] = "".join(sample["answer"])
+
+ if isinstance(sample["answer"], list) and len(sample["answer"]) == 1:
+ sample["answer"] = sample["answer"][0]
+
+ data_sample = {
+ "dataset": "gaokaobench",
+ "split": "test",
+ "category": f"{category[:-10]}-{subject}",
+ "instruction": sample["question"].strip() + "\n答案:",
+ "input": "",
+ "output": "",
+ "target": sample["answer"],
+ }
+
+ dataset["test"][subject]["data"].append(data_sample)
+
+ return dataset
diff --git a/applications/ColossalEval/colossal_eval/dataset/longbench.py b/applications/ColossalEval/colossal_eval/dataset/longbench.py
new file mode 100644
index 000000000000..9ea5e3c7d77f
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/dataset/longbench.py
@@ -0,0 +1,120 @@
+import os
+from copy import deepcopy
+from typing import Dict, List
+
+from colossal_eval.utils import get_json_list
+
+from colossalai.logging import DistributedLogger
+
+from .base import BaseDataset
+
+dataset2prompt = {
+ "narrativeqa": "You are given a story, which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:",
+ "qasper": 'You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
+ "multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
+ "multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:",
+ "hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
+ "2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
+ "musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
+ "dureader": "请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:",
+ "gov_report": "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:",
+ "qmsum": "You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:",
+ "multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:",
+ "vcsum": "下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:",
+ "trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}",
+ "triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}",
+ "samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}",
+ "lsht": "请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}",
+ "passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ",
+ "passage_retrieval_en": 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
+ "passage_retrieval_zh": '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
+ "lcc": "Please complete the code given below. \n{context}Next line of code:\n",
+ "repobench-p": "Please complete the code given below. \n{context}{input}Next line of code:\n",
+}
+
+dataset2maxlen = {
+ "narrativeqa": 128,
+ "qasper": 128,
+ "multifieldqa_en": 64,
+ "multifieldqa_zh": 64,
+ "hotpotqa": 32,
+ "2wikimqa": 32,
+ "musique": 32,
+ "dureader": 128,
+ "gov_report": 512,
+ "qmsum": 512,
+ "multi_news": 512,
+ "vcsum": 512,
+ "trec": 64,
+ "triviaqa": 32,
+ "samsum": 128,
+ "lsht": 64,
+ "passage_count": 32,
+ "passage_retrieval_en": 32,
+ "passage_retrieval_zh": 32,
+ "lcc": 64,
+ "repobench-p": 64,
+}
+
+default_inference_kwargs = {
+ "calculate_loss": True,
+ "all_classes": None,
+ "language": "Chinese",
+ "pretrain": False,
+ "max_new_tokens": 32,
+}
+
+
+class LongBenchDataset(BaseDataset):
+ """
+ Dataset class for LongBench dataset.
+ Data source: https://huggingface.co/datasets/THUDM/LongBench
+ This dataset class will convert the original dataset into the inference dataset.
+
+ Issue link: https://github.com/THUDM/LongBench/issues/15 (fixed)
+ There are duplicate target answers in `nq.jsonl`, but this doesn't affect evaluation results.
+ Also doesn't affect perplexity calculation (the program only need to select the minimum loss).
+ """
+
+ @staticmethod
+ def load(path: str, logger: DistributedLogger) -> List[Dict]:
+ dataset = {"test": {}}
+
+ files = os.listdir(path)
+ files.sort()
+
+ for file in files:
+ category = file[0:-6]
+
+ if category.endswith("_e"):
+ continue
+
+ dataset["test"][category] = {"data": []}
+
+ file_dir = os.path.join(path, file)
+
+ loaded_jsonl = get_json_list(file_dir)
+
+ # It's been tested that each data sample in one subcategory have same inference arguments.
+ inference_kwargs = deepcopy(default_inference_kwargs)
+ if loaded_jsonl[0]["all_classes"] is not None:
+ inference_kwargs["all_classes"] = loaded_jsonl[0]["all_classes"]
+ inference_kwargs["max_new_tokens"] = dataset2maxlen[category]
+ dataset["test"][category]["inference_kwargs"] = inference_kwargs
+
+ for sample in loaded_jsonl:
+ prompt = dataset2prompt[category].format(**sample)
+
+ data_sample = {
+ "dataset": "longbench",
+ "split": "test",
+ "category": category,
+ "instruction": prompt,
+ "input": "",
+ "output": "",
+ "target": sample["answers"],
+ }
+
+ dataset["test"][category]["data"].append(data_sample)
+
+ return dataset
diff --git a/applications/ColossalEval/colossal_eval/dataset/mmlu.py b/applications/ColossalEval/colossal_eval/dataset/mmlu.py
new file mode 100644
index 000000000000..b89c0a13cff1
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/dataset/mmlu.py
@@ -0,0 +1,73 @@
+import copy
+import csv
+import os
+from typing import Dict, List
+
+from colossalai.logging import DistributedLogger
+
+from .base import BaseDataset
+
+default_inference_kwargs = {
+ "calculate_loss": True,
+ "all_classes": ["A", "B", "C", "D"],
+ "language": "English",
+ "pretrain": False,
+ "max_new_tokens": 32,
+}
+
+
+def get_few_shot_data(data: List[Dict]):
+ few_shot_data = []
+ for i in data:
+ few_shot_data.append(i["input"] + i["target"])
+ return few_shot_data
+
+
+class MMLUDataset(BaseDataset):
+ """
+ Dataset class for MMLU dataset.
+ Data source: https://github.com/hendrycks/test
+ This dataset class will convert the original dataset into the inference dataset.
+ """
+
+ @staticmethod
+ def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
+ dataset = {"dev": {}, "test": {}}
+ for split in ["dev", "test"]:
+ files = os.listdir(os.path.join(path, split))
+ files.sort()
+
+ for file in files:
+ subject = file[0 : -len(f"_{split}.csv")].split("_")
+ subject = " ".join([word.title() if word != "us" else "US" for word in subject])
+
+ file_dir = os.path.join(path, split, file)
+
+ dataset[split][subject] = {"data": [], "inference_kwargs": {}}
+
+ # It's been tested that each data sample in one subcategory have same inference arguments.
+ dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)
+
+ if split == "test" and few_shot:
+ dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
+ dataset["dev"][subject]["data"]
+ )
+
+ with open(file_dir, encoding="utf-8") as f:
+ reader = csv.reader(f)
+ for row in reader:
+ assert len(row) == 6
+ choices = f"A. {row[1]}\nB. {row[2]}\nC. {row[3]}\nD. {row[4]}"
+ data_sample = {
+ "dataset": "mmlu",
+ "split": split,
+ "category": subject,
+ "instruction": f"The following is a single-choice question on {subject}. Answer the question by replying A, B, C or D.",
+ "input": f"Question: {row[0]}\n{choices}\nAnswer: ",
+ "output": "",
+ "target": row[5],
+ }
+
+ dataset[split][subject]["data"].append(data_sample)
+
+ return dataset
diff --git a/applications/ColossalEval/colossal_eval/dataset/mtbench.py b/applications/ColossalEval/colossal_eval/dataset/mtbench.py
new file mode 100644
index 000000000000..9e74a4d826e3
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/dataset/mtbench.py
@@ -0,0 +1,72 @@
+import copy
+import json
+import os
+from collections import defaultdict
+from typing import Dict, List
+
+from colossal_eval.utils import get_json_list
+
+from colossalai.logging import DistributedLogger
+
+from .base import BaseDataset
+
+default_inference_kwargs = {
+ "calculate_loss": False,
+ "all_classes": None,
+ "language": "English",
+ "pretrain": False,
+ "max_new_tokens": 1024,
+ "turns": 2,
+}
+
+
+class MTBenchDataset(BaseDataset):
+ """
+ Dataset class for mt_bench dataset.
+ Data source: https://github.com/lm-sys/FastChat/blob/main/fastchat/llm_judge/data/mt_bench/question.jsonl
+ This dataset class will convert the original dataset into the inference dataset.
+ """
+
+ def __init__(self, path, logger, few_shot):
+ self.multiturn = True
+ self.dataset = self.load(path, logger, few_shot)
+
+ @staticmethod
+ def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
+ dataset = {"test": defaultdict(dict)}
+
+ file_path = os.path.join(path, "question.jsonl")
+ ref_path = os.path.join(path, "reference_answer/gpt-4.jsonl")
+
+ reference = defaultdict(list)
+ ref_origin = get_json_list(ref_path)
+ for ref in ref_origin:
+ reference[ref["question_id"]] = ref["choices"][0]["turns"]
+
+ with open(file_path, "r", encoding="utf-8") as file:
+ for line in file:
+ question = json.loads(line)
+ category = question["category"]
+ turn_number = len(question["turns"])
+ data_point = {
+ "id": question["question_id"],
+ "dataset": "mtbench",
+ "split": "test",
+ "category": category,
+ "instruction": question["turns"],
+ "input": "",
+ "output": [],
+ "target": [""] * turn_number
+ if question["question_id"] not in reference
+ else reference[question["question_id"]],
+ }
+
+ if category in dataset["test"]:
+ dataset["test"][category]["data"].append(data_point)
+ else:
+ dataset["test"][category] = {
+ "data": [data_point],
+ "inference_kwargs": copy.deepcopy(default_inference_kwargs),
+ }
+
+ return dataset
diff --git a/applications/ColossalEval/colossal_eval/evaluate/GPT Evaluation.md b/applications/ColossalEval/colossal_eval/evaluate/GPT Evaluation.md
new file mode 100644
index 000000000000..37fbda4c8647
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/evaluate/GPT Evaluation.md
@@ -0,0 +1,248 @@
+# GPT Evaluation
+## Table of Contents
+- [Overview](#overview)
+- [GPT Evaluation](#gpt-evaluation)
+ - [Evaluation Category](#evaluation-category)
+ - [Evaluation Category Examples](#evaluation-category-examples)
+ - [Evaluation Metrics](#evaluation-metrics)
+- [Evaluation Process](#evaluation-process)
+ - [Data Format](#data-format)
+ - [Prompt](#prompt)
+ - [Battle Prompt](#battle-prompt)
+ - [Evaluation Prompt](#evaluation-prompt)
+ - [Evaluation](#evaluation)
+ - [Configuration](#configuration)
+ - [Evaluate](#evaluate)
+- [FAQ](#faq)
+- [Citations](#citations)
+
+
+## Overview
+
+In this directory, we introduce how you can evaluate your model using GPTs. It is now available for evaluation of both Chinese and English capability and we provide the following functions:
+
+* Compare the performance of two different models (battle).
+* Rate the model according to pre-defined metrics using prompting design.
+* Rate the model according to pre-defined metrics with additional reference answer using prompting design.
+
+## GPT Evaluation
+
+### Evaluation Category
+
+Our evaluation pipeline can examine the model's capability using different categories of questions. The following table includes some example categories. You can add your own questions.
+
+| Evaluation Category | Description |
+| :-----------------: | :----------------------------------------------------------- |
+| Brainstorming | Models are asked to generate a range of creative and diverse ideas according to the question. The capability of creativity is required. |
+| Chat | Models are asked to continue a multi-round dialogue given the roles involved. The capability of understanding, memorizing previous rounds of the dialogue and answering according to the persona provided is required. |
+| Generation | Models are asked to generate an email, letter, article, etc. The capability of generating texts in a high quality and human-written way is required. |
+| Open QA | Models are asked to answer an open QA question(without context provided). The capability of answering questions with the models' own knowledge base is required. |
+| Roleplay | Models are asked to play the role provided. The capability of engaging in the scenario and effectively interacting with the user is required. |
+
+
+### Evaluation Category Examples
+To better understand each evaluation category, here are some example questions provided. Example questions are in the `configs/gpt_evaluation/data` folder.
+
+
+| Evaluation Category | Chinese Example | English Example |
+| :-----------------: | :----------------------------------------------------------- | :----------------------------------------------------------- |
+| Brainstorming | 列举一些可以促进头发生长的食物。 | How do you properly chop an onion without crying? |
+| Chat | 基于以下角色信息完成一段对话。小张是一名新手爱好者,对养鸡有浓厚的兴趣。老李是一名有丰富经验的养鸡大师。 小张:您好,老李,我最近开始对养鸡感兴趣了,想请教您一些问题。 老李:你好,小张,我很乐意帮助你。你想问些什么? 小张:我想知道如何确定鸡的品种和性别? 老李:确切的品种可以通过鸡的外貌特征来确定,而性别一般是通过鸡卵的大小和形状来判断。还有什么问题吗? 小张: | Complete a dialogue based on the following character information. Alex: A novice writer who is struggling to find inspiration and develop his writing skills. Emma: A successful author with many published works, providing guidance and advice to Alex. Alex: Hi Emma, I have been writing for a while now but can't seem to make any progress. Can you give me any advice? Emma: Hi Alex, sure. What kind of writing are you doing? Alex: I'm trying to write a novel, but I just can't seem to find any inspiration. Emma: |
+| Generation | 请为一家咖啡店编写一篇简短的广告语,吸引更多的顾客。 | Write a set of guidelines for first-time pet owners on how to properly care for a new puppy. |
+| Open QA | 解释什么是RNA病毒和DNA病毒。 | Explain the process of osmosis in biological systems. |
+| Roleplay | 我要你把我写的句子翻译成表情符号。我会写句子,你会用表情符号表达它。我只是想让你用表情符号来表达它。除了表情符号,我不希望你回复任何内容。当我需要用中文告诉你一些事情时,我会用 {} 这样的大括号括起来。我的第一句话是“{我的职业是消防员。}” | I want you to act as a rapper. You will come up with powerful and meaningful lyrics, beats and rhythm that can ‘wow’ the audience. Your lyrics should have an intriguing meaning and message which people can relate too. When it comes to choosing your beat, make sure it is catchy yet relevant to your words, so that when combined they make an explosion of sound everytime! My first request is "I need a rap song about finding strength within yourself." |
+
+### Evaluation Metrics
+
+GPT evaluation uses GPT models to evaluate the prediction of different models and different pre-defined evaluation metrics are applied to different categories. The following table shows the 10 pre-defined evaluation metrics both in Chinese and English:
+
+| Evaluation Metric | Prompt Words | CoT(Chain-of-Thought) |
+| :-------------------: | :----------------------------------------------------------- | :----------------------------------------------------------- |
+| 语言组织 (Language organization) | 语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc. | 1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。 2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说 3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。 4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。 5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。 6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes. 2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory. 3. Determine if the answer is relevant to the question or topic and conveys a clear message. 4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs. 5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information. 6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization. |
+| 切题 (Relevance) | 切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic. | 1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。 2. 阅读答案,确认答案是否直接回答了题目所问的问题。 3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。 4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。1. Read the question to determine what the question asks and what aspects of the question need to be answered. 2. Read the answers to make sure that they directly answer the question asked. 3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc. 4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all. |
+| 创意性 (Creativity) | 创意性(1-5):某些头脑风暴问题可能需要答案具有创意,提出新的思路。Creativity (1-5): Some brainstorming questions may require answers that are creative and suggest new ideas. | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。 2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则创意性评分可能会受到影响。 3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠,但仍然可以被认为是有创意的,只要它提供了新的角度或方法来解决问题。 4. 根据答案的创意性,给出一个1到5的评分。如果答案缺乏创意,则应给出一个较低的评分。如果答案具有创意并提供了新的思路,应给出一个较高的评分。1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions. 2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the creativity score may be affected. 3. Consider whether the answer contains novel ideas or unique thoughts. An answer may overlap with a known solution and still be considered creative, as long as it offers a new perspective or approach to the problem. 4. Give a score of 1 to 5 depending on the creativity of the answer. If the answer lacks creativity, a lower score should be given. If the answer is creative and provides a new idea, a higher score should be given. |
+| 实用性 (Practicality) | 实用性(1-5):某些头脑风暴问题可能需要答案提出实用的建议或解决方法。Practicality (1-5): Some brainstorming questions may require answers to suggest practical suggestions or solutions. | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。 2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则实用性评分可能会受到影响。 3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好,但如果无法实现或应用,则实用性评分可能会受到影响。 4. 根据答案的实用性,给出一个1到5的评分。如果答案缺乏实用性,则应给出一个较低的评分。如果答案提出了实用的建议或解决方法,并且可以很好地解决问题,则应给出一个较高的评分。1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions. 2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the practicality score may be affected. 3. Consider whether the suggestions or solutions presented in the answer are practical and workable. The answer may look good, but if it cannot be implemented or applied, the practicality score may be affected. 4. Give a score of 1 to 5 depending on the practicality of the answer. If the answer lacks practicality, a lower score should be given. If the answer makes a practical suggestion or solution and solves the problem well, a higher score should be given. |
+| 正确性 (Correctness) | 正确性(1-5):正确性(1-5):答案是否正确。 Correctness (1-5): whether the answer is correct or not. | 1. 仔细阅读题目,尝试自己回答该问题。 2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为5分。如果答案是部分正确的,则可以给予适当的得分,例如2分、3分或4分。如果答案完全不正确,则只得1分。
1. Read the question carefully and try to answer the question yourself. 2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be given. If the answer is completely incorrect, only 1 point is awarded. |
+| 自然 (Naturalness) | 自然(1-5):答案是否自然,并且符合问题给定的身份。Naturalness (1-5): whether the answer is natural and fits the identity given by the question. | 1. 阅读题目,确定题目提供的身份信息。 2. 检查答案内容是否符合题目给定的身份。 3. 根据以上因素,对该回答的自然性进行打分,分数从1到5,其中1表示不自然,5表示非常自然,并符合问题给定的身份。1. Read the question and determine the identity information provided in the question. 2. Check whether the content of the answer matches the identity given in the question. 3. Based on the above factors, score the naturalness of the response on a scale from 1 to 5, where 1 means unnatural and 5 means very natural and in accordance with the identity given in the question. |
+| 参与感 (Engagingness) | 参与感(1-5):答案是否对前面的对话内容做出了恰当的反应,是否理解对话的语境和背景。Engagingness (1-5): whether the answer responds appropriately to the content of the preceding conversation and whether it understands the context and background of the conversation. | 1. 阅读题目,确定对话的语境和背景。 2. 检查答案是否充分理解对话的语境和背景,能否自然地融入到对话中而不显得突兀。 3. 根据以上因素,对该回答的参与感进行打分,分数从1到5,其中1表示没有参与感,5表示非常有参与感,并且恰当地理解了对话的语境和背景。1. Read the questions to determine the context and background of the dialogue. 2. Check that the answer fully understands the context and background of the conversation and that it fits naturally into the conversation without seeming abrupt. 3. Based on the above factors, rate the response's engagement on a scale from 1 to 5, where 1 means not engaged and 5 means very engaged and appropriately understands the context and background of the conversation. |
+| 合理性 (Reasonableness) | 合理性(1-5):答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。Reasonableness (1-5): Whether the answer can form a logical connection with the content of the previous dialogue, whether it is consistent with common sense, and whether it can reasonably exist in this context. | 1. 阅读题目,确定对话的主题以及问题期望的回答方向。 2. 判断答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。 3. 根据以上因素,对该回答的合理性进行打分,分数从1到5,其中1表示不合理,5表示非常合理,并且能够与前面的对话内容形成逻辑上的衔接,并符合常理。1. Read the question and determine the topic of the conversation and the direction the question expects the answer to go. 2. Determine whether the answer can be logically connected to the preceding conversation, whether it makes common sense, and whether it can reasonably exist in this context. 3. Based on the above factors, rate the reasonableness of the answer on a scale from 1 to 5, where 1 means unreasonable and 5 means very reasonable and able to form a logical connection with the preceding dialogue content and consistent with common sense. |
+| 多样性 (Diversity) | 多样性(1-5):答案使用语言是否优美,具有有一定的创造性和想象力。然而,回答也应该保持合理和适度,不要过于夸张或离题。Diversity (1-5): Whether the answers use beautiful language and have some creativity and imagination. However, answers should also be kept reasonable and moderate, not overly exaggerated or off-topic. | 1. 仔细阅读整个回答,确保完全理解回答所表达的内容和主题。 2. 在阅读回答的同时,注意语言的质量,例如措辞是否正确,语言是否生动等。 3. 检查回答的创造性和想象力,看看回答是否能够吸引人阅读下去。 4. 检查回答的合理性和适度,看看回答是否夸张或离题。5. 将多样性的评分打分在1到5之间,5分表示回答的质量很好,能够吸引人阅读,1分表示回答的内容生硬或者有离题的问题。1. Read the entire response carefully to ensure that you fully understand the content and theme expressed in the response. 2. While reading the response, pay attention to the quality of the language, such as whether the wording is correct and the language is vivid. 3. Check the creativity and imagination of the response to see if the response is engaging to read on. 4. Check the reasonableness and appropriateness of the responses to see if the responses are exaggerated or off-topic. 5. Rate the diversity on a scale of 1 to 5, with a 5 indicating a good quality response that is engaging to read and a 1 indicating a raw response or a question that is off-topic. |
+| 保真度 (Fidelity) | 保真度(1-5):答案是否能够严格遵守角色的设定回答给定的请求。Fidelity (1-5): whether the answer is able to answer the given request in strict compliance with the role setting. | 1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。 阅读题目的请求,确认回答请求时需要注意的细节。 3. 对比提供的回答与该角色的设定,评估回答是否能够严格遵守角色的设定。 4. 结合以上评估结果给出保真度的评分,范围从1到5分,其中1分表示回答与角色设定完全不符,5分表示回答完全符合角色设定且满足给定请求。1. Read the question carefully to understand how the character is set up and represented in the question, including aspects such as occupation, background, point of view, and personality. 2. Read the question's request and confirm the details that need to be taken into account when answering the request. 3. Compare the provided answer with the setting of the role and assess whether the answer can strictly adhere to the setting of the role. 4. Combine the results of the above assessment to give a fidelity score ranging from 1 to 5, where a score of 1 means that the response does not match the persona at all, and a score of 5 means that the response fully complies with the persona and satisfies the given request. |
+
+GPT models evaluate the quality of model predictions based on the given prompt words and gives a score between 1-5.
+
+> **NOTE 1:** You can find all the prompt words and CoT(Chain-of-Thought) in `configs/gpt_evaluation/prompt/evaluation_prompt`.
+
+> **NOTE 2:** To add customized metrics, you can refer to [FAQ](#faq).
+
+## Evaluation Process
+
+### Data Format
+
+A JSON file contains one list. Each element in the list is a target answer / prediction record for one instruction / question.
+An element should have the following fields:
+
+* `category` (str, compulsory): The category of the instruction / question.
+* `instruction` (str, compulsory): The instruction / question for the LLM.
+* `input` (str, optional): The additional context of the instruction / question.
+* `output` (str, optional): The model output of the instruction, models will fill in this field during inference time.
+* `target` (str, optional): The target answer for the instruction.
+* `id` (int, compulsory): The ID of the instruction / question.
+
+Example:
+
+```json
+[
+ {
+ "category": "brainstorming",
+ "instruction": "请问如何制作一份美味的西红柿炒鸡蛋?",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 1
+ },
+ {
+ "category": "chat",
+ "instruction": "基于以下角色信息完成一段对话。小张是一名新手爱好者,对养鸡有浓厚的兴趣。老李是一名有丰富经验的养鸡大师。",
+ "input": "小张:您好,老李,我最近开始对养鸡感兴趣了,想请教您一些问题。 老李:你好,小张,我很乐意帮助你。你想问些什么? 小张:我想知道如何确定鸡的品种和性别? 老李:确切的品种可以通过鸡的外貌特征来确定,而性别一般是通过鸡卵的大小和形状来判断。还有什么问题吗? 小张:",
+ "output": "",
+ "target": "",
+ "id": 2
+ }
+]
+```
+
+### Prompt
+
+#### Battle Prompt
+
+The following is the Chinese battle prompt. In the battle prompt, the question and answers from two different models are fed into the prompt template. You can find example battle prompt files for Chinese and English in `configs/gpt_evaluation/prompt/battle_prompt`.
+
+```json
+{
+ "id": 1,
+ "system_prompt": "你是一个检查回答质量的好助手。",
+ "prompt_template": "[问题]\n{question}\n\n[1号AI助手的答案]\n{answer_1}\n\n[1号AI助手答案终止]\n\n[2号AI助手的答 案]\n{answer_2}\n\n[2号AI助手答案终止]\n\n[要求]\n{prompt}\n\n",
+ "prompt": "我们需要你评价这两个AI助手回答的性能。\n请对他们的回答的有用性、相关性、准确性、详细程度进行评分。每个AI助手都会得到一个1到10分的总分,分数越高表示整体表现越好。\n请首先输出一行,该行只包含两个数值,分别表示1号和2号AI助手的分数。这两个分数之间要有一个空格。在随后的一行中,请对你的评价作出全面的解释,避免任何潜在的偏见,并确保AI助手回答的顺序不会影响您的判断。"
+}
+```
+
+#### Evaluation Prompt
+
+The following is an example of a Chinese GPT evaluation prompt. In an evaluation prompt, you should define your metrics in `metrics` and provide CoT(Chain-of-Thought) in `CoT`. You can find example evaluation prompt files for Chinese and English in `configs/gpt_evaluation/prompt/evaluation_prompt`.
+
+```json
+{
+ "brainstorming": {
+ "id": 1,
+ "category": "brainstorming",
+ "metrics": {
+ "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。"
+ },
+ "CoT": {
+ "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:"
+ },
+ "prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
+ }
+}
+```
+
+`"metrics"`: the metrics that can be used in GPT evaluation. This field determines which metrics can be added to your config file.
+
+`"CoT"`: evaluation steps you prompt to GPT models for each metric defined in `"metrics"`.
+
+### Evaluation
+
+#### Configuration
+
+The following is an example of a Chinese config file. The configuration file can control how the pipeline evaluates the model. You need to specify GPT evaluation metrics in key `GPT`. You can find an example English config file in `configs/gpt_evaluation/config/config_en.json`.
+
+```json
+{
+ "language": "cn",
+ "category": {
+ "brainstorming": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "creativity",
+ "practicality",
+ "reasonableness"
+ ]
+ }
+ }
+}
+```
+
+`"language"`: the language used to evaluate the model capability. We only support Chinese `"cn"` for now.
+
+`"category"`: the category/categories needed to evaluate the model capability.
+
+`"GPT"`: the metrics you want to use for GPT evaluation.
+
+
+#### Evaluate
+
+After setting the configuration file, you can evaluate the model using `examples/gpt_evaluation/eval.py`. If you want to make comparisons between answers of two different models, you should specify two answer files in the argument `answer_file_list` and two model names in the argument `model_name_list`. If you want to evaluate one answer file, the length of both `answer_file_list` and `model_name_list` should be 1 and the program will perform evaluation using automatic metrics and GPT models.
+
+An example script is provided as follows:
+
+```shell
+python eval.py \
+ --config_file "path to the config file" \
+ --battle_prompt_file "path to the prompt file for battle" \
+ --gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
+ --target_file "path to the target answer file" \
+ --answer_file_list "path to the answer files of at most 2 models" \
+ --model_name_list "the names of at most 2 models" \
+ --gpt_model "which GPT model to use for evaluation" \
+ --save_path "path to save results" \
+ --openai_key "your openai key" \
+```
+
+If you want GPT evaluation with reference, you can add an argument `--gpt_with_reference`, but make sure the reference file have target answers.
+
+## FAQ
+
+How can I add a new GPT evaluation metric?
+
+For example, if you want to add a new metric `persuasiveness` into category `brainstorming`, you should add the metric definition and its corresponding CoT(Chain-of-thought) in the evaluation prompt file in `prompt/evaluation_promt`. The CoT can be generated using ChatGPT. You can prompt ChatGPT to generate evaluation steps for the new metric.
+
+```json
+{
+ "brainstorming": {
+ "id": 1,
+ "category": "brainstorming",
+ "metrics": {
+ "persuasiveness": "persuasiveness(1-5):a short description for persuasiveness"
+ },
+ "CoT": {
+ "persuasiveness": "CoT for persuasiveness\n\npersuasiveness:"
+ },
+ "prompt": "You are a good assistant. Please rate the given answer to the \"brainstorming\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
+ }
+}
+```
+
+
+
+## Citations
+
+```bibtex
+@misc{vicuna2023,
+ title = {Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90\%* ChatGPT Quality},
+ url = {https://vicuna.lmsys.org},
+ author = {Chiang, Wei-Lin and Li, Zhuohan and Lin, Zi and Sheng, Ying and Wu, Zhanghao and Zhang, Hao and Zheng, Lianmin and Zhuang, Siyuan and Zhuang, Yonghao and Gonzalez, Joseph E. and Stoica, Ion and Xing, Eric P.},
+ month = {March},
+ year = {2023}
+}
+
+@misc{liu2023geval,
+ title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
+ author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
+ year={2023},
+ eprint={2303.16634},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
diff --git a/applications/ColossalEval/colossal_eval/evaluate/__init__.py b/applications/ColossalEval/colossal_eval/evaluate/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/__init__.py b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/__init__.py
new file mode 100644
index 000000000000..3c5df09a6909
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/__init__.py
@@ -0,0 +1,3 @@
+from .dataset_evaluator import DatasetEvaluator
+
+__all__ = ["DatasetEvaluator"]
diff --git a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py
new file mode 100644
index 000000000000..57ccd1aa6a1e
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py
@@ -0,0 +1,307 @@
+import os
+from typing import Dict, List
+
+import colossal_eval.evaluate.dataset_evaluator.metrics as metric_helper
+import numpy as np
+import tqdm
+from colossal_eval.utils import jdump
+
+LabelBasedMetrics = ["first_token_accuracy", "matthews_correlation"]
+LossBasedMetrics = ["perplexity", "ppl_score", "ppl_score_over_choices", "per_byte_perplexity", "per_byte_ppl_score"]
+CombinedMetrics = ["combined_single_choice_accuracy"]
+GPTMetrics = ["mtbench_single_judge"]
+OtherMetrics = [
+ "f1_score",
+ "f1_zh_score",
+ "rouge_score",
+ "rouge_zh_score",
+ "retrieval_score",
+ "retrieval_zh_score",
+ "classification_score",
+ "code_sim_score",
+ "count_score",
+ "multi_choice_accuracy",
+ "math_equivalence",
+ "single_choice_accuracy",
+]
+
+
+class DatasetEvaluator(object):
+ """
+ Dataset evaluator.
+
+ """
+
+ def __init__(self, config_path: str, save_path: str):
+ self.config_path = config_path
+ self.save_path = save_path
+
+ def _calculate_label_metrics(self, metric: str, category: str):
+ """Calculate label-based metrics."""
+ weight = len(self.data[category]["data"]) / self.metric_total_length[metric]
+
+ str_label_map = {
+ choice: idx for idx, choice in enumerate(self.data[category]["inference_kwargs"]["all_classes"])
+ }
+
+ references = [str_label_map[sample["target"]] for sample in self.data[category]["data"]]
+ [sample["output"] for sample in self.data[category]["data"]]
+
+ flag = False
+ softmaxs = []
+ for i, sample in enumerate(self.data[category]["data"]):
+ if np.any(np.isnan(np.array(list(sample["softmax_over_choices"].values())))):
+ if not flag:
+ print(
+ f"NaN in the softmax, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}."
+ )
+ flag = True
+ score = 0
+ for ref in sample["target"]:
+ score = max(
+ score,
+ metric_helper.single_choice_accuracy(
+ sample["output"], ref, all_classes=self.data[category]["inference_kwargs"]["all_classes"]
+ ),
+ )
+
+ score = max(
+ score,
+ metric_helper.accuracy_by_options(sample["input"], sample["output"], ref),
+ )
+ softmaxs.append(references[i] if score == 1 else -1)
+ else:
+ softmaxs.append(np.argmax(np.array(list(sample["softmax_over_choices"].values()))))
+
+ references = np.array(references)
+ softmaxs = np.array(softmaxs)
+ scores = np.sum(references == softmaxs) / len(self.data[category]["data"]) * 100
+
+ self.evaluation_results[metric][category] = (scores, len(self.data[category]["data"]))
+ self.evaluation_results[metric]["ALL"] += scores * weight
+
+ def _calculate_combined_metrics(self, metric: str, category: str):
+ """Calculate combined metrics."""
+ weight = len(self.data[category]["data"]) / self.metric_total_length[metric]
+
+ references = [sample["target"] for sample in self.data[category]["data"]]
+ predictions = [sample["output"] for sample in self.data[category]["data"]]
+
+ str_label_map = {
+ choice: idx for idx, choice in enumerate(self.data[category]["inference_kwargs"]["all_classes"])
+ }
+
+ references_labels = [str_label_map[sample["target"][0]] for sample in self.data[category]["data"]]
+ predictions = [sample["output"] for sample in self.data[category]["data"]]
+
+ flag = False
+ softmaxs = []
+ for i, sample in enumerate(self.data[category]["data"]):
+ if np.any(np.isnan(np.array(list(sample["softmax_over_choices"].values())))):
+ if not flag:
+ print(
+ f"NaN in the softmax, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}."
+ )
+ flag = True
+ score = 0
+ for ref in sample["target"]:
+ score = max(
+ score,
+ metric_helper.single_choice_accuracy(
+ sample["output"], ref, all_classes=self.data[category]["inference_kwargs"]["all_classes"]
+ ),
+ )
+ softmaxs.append(references[i] if score == 1 else -1)
+ else:
+ softmaxs.append(np.argmax(np.array(list(sample["softmax_over_choices"].values()))))
+
+ metric_method = eval("metric_helper." + metric)
+
+ total_score = 0.0
+ for prediction, reference, references_label, softmax in zip(
+ predictions, references, references_labels, softmaxs
+ ):
+ score = 0.0
+
+ for ref in reference:
+ score = max(
+ score,
+ metric_method(prediction, ref, all_classes=self.data[category]["inference_kwargs"]["all_classes"]),
+ )
+ if references_label == softmax:
+ score = 1
+
+ total_score += score
+ total_score = total_score * 100 / len(self.data[category]["data"])
+
+ self.evaluation_results[metric][category] = (total_score, len(self.data[category]["data"]))
+ self.evaluation_results[metric]["ALL"] += total_score * weight
+
+ def _calculate_other_metrics(self, metric: str, category: str):
+ """Calculate other metrics."""
+ weight = len(self.data[category]["data"]) / self.metric_total_length[metric]
+
+ references = [sample["target"] for sample in self.data[category]["data"]]
+ predictions = [sample["output"] for sample in self.data[category]["data"]]
+
+ metric_method = eval("metric_helper." + metric)
+
+ total_score = 0.0
+ for prediction, reference in zip(predictions, references):
+ score = 0.0
+ for ref in reference:
+ score = max(
+ score,
+ metric_method(prediction, ref, all_classes=self.data[category]["inference_kwargs"]["all_classes"]),
+ )
+ total_score += score
+ total_score = total_score * 100 / len(predictions)
+
+ self.evaluation_results[metric][category] = (total_score, len(self.data[category]["data"]))
+ self.evaluation_results[metric]["ALL"] += total_score * weight
+
+ def _calculate_gpt_metrics(self, metric: str, category: str):
+ """Calculate gpt metrics."""
+ weight = len(self.data[category]["data"]) / self.metric_total_length[metric]
+
+ metric_method = eval("gpt_helper." + metric)
+
+ judgements, avg_ratings = metric_method(self.data[category]["data"], self.config_path)
+ self.judgements[category] = judgements
+
+ self.evaluation_results[metric][category] = (np.mean(avg_ratings), len(self.data[category]["data"]))
+ self.evaluation_results[metric]["ALL"] += np.mean(avg_ratings) * weight
+
+ for i in range(avg_ratings.shape[0]):
+ if f"{metric}_{i+1}" not in self.evaluation_results:
+ self.evaluation_results[f"{metric}_{i+1}"] = {cat: 0 for cat in (["ALL"] + self.categories)}
+ self.evaluation_results[f"{metric}_{i+1}"][category] = (avg_ratings[i], len(self.data[category]["data"]))
+ self.evaluation_results[f"{metric}_{i+1}"]["ALL"] += avg_ratings[i] * weight
+
+ def _calculate_loss_metrics(self, metric: str, category: str):
+ """Calculate perplexity."""
+ if metric == "perplexity":
+ weight = len(self.data[category]["data"]) / self.metric_total_length[metric]
+ losses = [min(sample["loss"]) for sample in self.data[category]["data"]]
+ perplexity = np.mean(np.exp(np.array(losses)))
+
+ self.evaluation_results["perplexity"][category] = (perplexity, len(self.data[category]["data"]))
+ self.evaluation_results["perplexity"]["ALL"] += perplexity * weight
+ elif metric == "ppl_score":
+ weight = len(self.data[category]["data"]) / self.metric_total_length[metric]
+ losses = [min(sample["loss"]) for sample in self.data[category]["data"]]
+ perplexity_score = np.mean(np.exp(-np.array(losses))) * 100
+
+ self.evaluation_results["ppl_score"][category] = (perplexity_score, len(self.data[category]["data"]))
+ self.evaluation_results["ppl_score"]["ALL"] += perplexity_score * weight
+ elif metric == "ppl_score_over_choices" and self.data[category]["inference_kwargs"]["all_classes"] is not None:
+ weight = len(self.data[category]["data"]) / self.metric_total_length[metric]
+ loss_over_choices = [sample["loss_over_choices"] for sample in self.data[category]["data"]]
+ perplexity_score_over_choices = np.mean(np.exp(-np.array(loss_over_choices))) * 100
+
+ self.evaluation_results["ppl_score_over_choices"][category] = (
+ perplexity_score_over_choices,
+ len(self.data[category]["data"]),
+ )
+ self.evaluation_results["ppl_score_over_choices"]["ALL"] += perplexity_score_over_choices * weight
+ elif metric == "per_byte_perplexity":
+ weight = len(self.data[category]["data"]) / self.metric_total_length[metric]
+ losses = [min(sample["loss_sum"]) for sample in self.data[category]["data"]]
+ perplexity = np.mean(np.exp(np.array(losses) / np.array(self.N_bytes[category])))
+
+ self.evaluation_results["per_byte_perplexity"][category] = perplexity
+ self.evaluation_results["per_byte_perplexity"]["ALL"] += perplexity * weight
+ elif metric == "per_byte_ppl_score":
+ weight = len(self.data[category]["data"]) / self.metric_total_length[metric]
+ losses = [min(sample["loss_sum"]) for sample in self.data[category]["data"]]
+ perplexity_score = np.mean(np.exp(-np.array(losses) / np.array(self.N_bytes[category]))) * 100
+
+ self.evaluation_results["per_byte_ppl_score"][category] = perplexity_score
+ self.evaluation_results["per_byte_ppl_score"]["ALL"] += perplexity_score * weight
+
+ def _evaluate(self):
+ """Calculate and return evaluation results"""
+
+ for metric in self.metrics:
+ pbar = tqdm.tqdm(
+ desc=f"{self.dataset_name}-{metric}-{self.model_name}", total=len(self.suggested_categories[metric])
+ )
+
+ if metric in LabelBasedMetrics:
+ for category in self.suggested_categories[metric]:
+ self._calculate_label_metrics(metric, category)
+ pbar.update(1)
+ elif metric in LossBasedMetrics:
+ for category in self.suggested_categories[metric]:
+ self._calculate_loss_metrics(metric, category)
+ pbar.update(1)
+ elif metric in CombinedMetrics:
+ for category in self.suggested_categories[metric]:
+ self._calculate_combined_metrics(metric, category)
+ pbar.update(1)
+ elif metric in GPTMetrics:
+ for category in self.suggested_categories[metric]:
+ self._calculate_gpt_metrics(metric, category)
+ pbar.update(1)
+ elif metric in OtherMetrics:
+ for category in self.suggested_categories[metric]:
+ self._calculate_other_metrics(metric, category)
+ pbar.update(1)
+ else:
+ raise Exception(f"{metric} not supported.")
+
+ if self.judgements:
+ judgement_path = os.path.join(self.save_path, f"{self.model_name}_judgements.json")
+ jdump(self.judgements, judgement_path)
+
+ return self.evaluation_results
+
+ def get_evaluation_results(self, data: List[Dict], dataset_name: str, model_name: str, metrics: List[str]):
+ """
+ Evaluate inference data on the given metrics.
+
+ Args:
+ data: Data to be evaluated.
+ dataset_name: Name of the dataset
+ model_name: Name of the model
+ metrics: Metrics used to evaluate.
+
+ """
+ self.data = data
+ self.dataset_name = dataset_name
+ self.model_name = model_name
+ self.categories = list(data.keys())
+ self.metrics = metrics
+ self.judgements = {}
+
+ self.evaluation_results = {
+ metric: {category: 0 for category in (["ALL"] + self.categories)} for metric in self.metrics
+ }
+
+ self.total_length = 0
+ self.total_single_choices = 0
+ for value in self.data.values():
+ self.total_length += len(value["data"])
+ if value["inference_kwargs"]["all_classes"] is not None:
+ self.total_single_choices += len(value["data"])
+
+ self.metric_total_length = {metric: 0 for metric in self.metrics}
+ self.suggested_categories = {metric: [] for metric in self.metrics}
+
+ for metric in self.metrics:
+ self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_name][metric]
+ if "ALL" in self.suggested_categories[metric]:
+ self.suggested_categories[metric] = self.categories
+ self.metric_total_length[metric] = self.total_length
+ continue
+ for category in self.suggested_categories[metric]:
+ self.metric_total_length[metric] += len(self.data[category]["data"])
+
+ if "per_byte_perplexity" in self.metrics or "per_byte_ppl_score" in self.metrics:
+ self.N_bytes = {category: [] for category in self.categories}
+ for category in self.categories:
+ samples = self.data[category]["data"]
+ for sample in samples:
+ self.N_bytes[category].append(sample["byte_num"][0])
+
+ return self._evaluate()
diff --git a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/gpt_judge.py b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/gpt_judge.py
new file mode 100644
index 000000000000..cd41dd7fdff0
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/gpt_judge.py
@@ -0,0 +1,151 @@
+# Code adapted from https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge
+
+import ast
+import concurrent.futures
+import copy
+import json
+import os
+import re
+import time
+from typing import Any, Dict, List
+
+import numpy as np
+import openai
+import tqdm
+
+MODEL = "gpt-4"
+
+API_MAX_RETRY = 16
+API_RETRY_SLEEP = 10
+API_ERROR_OUTPUT = "$ERROR$"
+
+NEED_REF_CATS = ["math", "reasoning", "coding"]
+
+one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]")
+one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]")
+
+
+def load_mt_prompts(prompt_file: str):
+ prompts = {}
+ with open(prompt_file) as fin:
+ for line in fin:
+ line = json.loads(line)
+ prompts[line["name"]] = line
+ return prompts
+
+
+def get_mt_prompt(prompts: Dict[str, str], multiturn: bool, math: bool):
+ if math and multiturn:
+ return prompts["single-math-v1-multi-turn"]
+ elif math and not multiturn:
+ return prompts["single-math-v1"]
+ elif not math and multiturn:
+ return prompts["single-v1-multi-turn"]
+ elif not math and not multiturn:
+ return prompts["single-v1"]
+
+
+def chat_compeletion_openai(messages: List[Dict], temperature: float = 0.0, max_tokens: int = 2048):
+ output = API_ERROR_OUTPUT
+ model = MODEL
+ for _ in range(API_MAX_RETRY):
+ try:
+ response = openai.ChatCompletion.create(
+ model=model,
+ messages=messages,
+ n=1,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ )
+ output = response["choices"][0]["message"]["content"]
+ break
+ except openai.error.OpenAIError as e:
+ print(type(e), e)
+ time.sleep(API_RETRY_SLEEP)
+
+ return output
+
+
+def get_mtbench_judgements(question: Dict[str, Any], prompts: Dict[str, str]):
+ id = question["id"]
+ judgement = {"id": id, "judgements": [], "ratings": []}
+ category = question["category"]
+ math = category in NEED_REF_CATS
+ turn_number = len(question["instruction"])
+
+ for num in range(turn_number):
+ assert (len(question["target"]) >= 1 and math) or not math
+ kwargs = {}
+ if num >= 1:
+ prompt = get_mt_prompt(prompts, multiturn=True, math=math)
+ if len(question["target"]) >= 1 and math:
+ kwargs = {f"ref_answer_{i+1}": question["target"][i] for i in range(len(question["target"]))}
+ user_prompt = prompt["prompt_template"].format(
+ question_1=question["instruction"][0],
+ question_2=question["instruction"][1],
+ answer_1=question["output"][0],
+ answer_2=question["output"][1],
+ **kwargs,
+ )
+ else:
+ prompt = get_mt_prompt(prompts, multiturn=False, math=math)
+ if len(question["target"]) >= 1 and math:
+ kwargs = {"ref_answer_1": question["target"][0]}
+ user_prompt = prompt["prompt_template"].format(
+ question=question["instruction"][0],
+ answer=question["output"][0],
+ **kwargs,
+ )
+
+ rating = -1
+ sys_prompt = prompt["system_prompt"]
+ messages = [{"role": "system", "content": sys_prompt}, {"role": "user", "content": user_prompt}]
+
+ judgement_str = chat_compeletion_openai(messages, temperature=0.0, max_tokens=2048)
+ match = re.search(one_score_pattern, judgement_str)
+ if not match:
+ match = re.search(one_score_pattern_backup, judgement_str)
+ if match:
+ rating = ast.literal_eval(match.groups()[0])
+ else:
+ rating = -1
+
+ judgement["judgements"].append(judgement_str)
+ judgement["ratings"].append(rating)
+
+ return judgement
+
+
+def mtbench_single_judge(data: List[Dict], config_path: str):
+ judgements = []
+
+ prompt_dir = os.path.dirname(config_path)
+ prompts = load_mt_prompts(os.path.join(prompt_dir, "mtbench_judge_prompts.jsonl"))
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+ futures = []
+ for i, question in enumerate(data):
+ future = executor.submit(get_mtbench_judgements, question, prompts)
+ futures.append(future)
+
+ for future in tqdm.tqdm(
+ concurrent.futures.as_completed(futures),
+ desc=f"MTBench single judge for {data[0]['category']}",
+ total=len(futures),
+ ):
+ judgements.append(future.result())
+
+ judgements.sort(key=lambda x: x["id"])
+
+ judgements_by_id = {j["id"]: j for j in judgements}
+
+ data_to_dump = copy.deepcopy(data)
+
+ for d in data_to_dump:
+ id = d["id"]
+ d["judgements"] = judgements_by_id[id]["judgements"]
+ d["ratings"] = judgements_by_id[id]["ratings"]
+
+ avg_ratings = np.mean([j["ratings"] for j in judgements], axis=0)
+
+ return data_to_dump, avg_ratings
diff --git a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py
new file mode 100644
index 000000000000..eae35bb9bb85
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py
@@ -0,0 +1,638 @@
+# Code adapted from https://github.com/THUDM/LongBench/blob/main/metrics.py
+# Code adapted from https://github.com/hendrycks/math/blob/main/modeling/math_equivalence.py
+# Code adapted from https://github.com/ruixiangcui/AGIEval/blob/main/src/evaluation.py
+
+import difflib
+import re
+import string
+from collections import Counter
+
+import jieba
+from fuzzywuzzy import fuzz
+from rouge import Rouge
+
+metrics4subcategory = {
+ "pretrain": {
+ "perplexity": ["ALL"],
+ "ppl_score": ["ALL"],
+ "per_byte_perplexity": ["ALL"],
+ "per_byte_ppl_score": ["ALL"],
+ },
+ # The commented are non 4-choice questions.
+ "agieval": {
+ "combined_single_choice_accuracy": [
+ # "lsat-ar",
+ # "lsat-lr",
+ # "lsat-rc",
+ "logiqa-en",
+ "sat-math",
+ "sat-en",
+ # "aqua-rat",
+ "sat-en-without-passage",
+ "gaokao-english",
+ "logiqa-zh",
+ "gaokao-chinese",
+ "gaokao-geography",
+ "gaokao-history",
+ "gaokao-biology",
+ "gaokao-chemistry",
+ ],
+ "first_token_accuracy": [
+ # "lsat-ar",
+ # "lsat-lr",
+ # "lsat-rc",
+ "logiqa-en",
+ "sat-math",
+ "sat-en",
+ # "aqua-rat",
+ "sat-en-without-passage",
+ "gaokao-english",
+ "logiqa-zh",
+ "gaokao-chinese",
+ "gaokao-geography",
+ "gaokao-history",
+ "gaokao-biology",
+ "gaokao-chemistry",
+ ],
+ "single_choice_accuracy": [
+ # "lsat-ar",
+ # "lsat-lr",
+ # "lsat-rc",
+ "logiqa-en",
+ "sat-math",
+ "sat-en",
+ # "aqua-rat",
+ "sat-en-without-passage",
+ "gaokao-english",
+ "logiqa-zh",
+ "gaokao-chinese",
+ "gaokao-geography",
+ "gaokao-history",
+ "gaokao-biology",
+ "gaokao-chemistry",
+ ],
+ "multi_choice_accuracy": ["jec-qa-kd", "jec-qa-ca", "gaokao-physics", "gaokao-mathqa"],
+ "math_equivalence": ["gaokao-mathcloze", "math"],
+ "perplexity": ["ALL"],
+ "ppl_score_over_choices": [
+ "lsat-ar",
+ "lsat-lr",
+ "lsat-rc",
+ "logiqa-en",
+ "sat-math",
+ "sat-en",
+ "aqua-rat",
+ "sat-en-without-passage",
+ "gaokao-english",
+ "logiqa-zh",
+ "jec-qa-kd",
+ "jec-qa-ca",
+ "gaokao-chinese",
+ "gaokao-geography",
+ "gaokao-history",
+ "gaokao-biology",
+ "gaokao-chemistry",
+ "gaokao-physics",
+ "gaokao-mathqa",
+ ],
+ "ppl_score": ["ALL"],
+ },
+ "cmmlu": {
+ "first_token_accuracy": ["ALL"],
+ "single_choice_accuracy": ["ALL"],
+ "perplexity": ["ALL"],
+ "ppl_score_over_choices": ["ALL"],
+ "ppl_score": ["ALL"],
+ },
+ "gaokaobench": {
+ "combined_single_choice_accuracy": [
+ "English MCQs",
+ "Biology MCQs",
+ "Chemistry MCQs",
+ "History MCQs",
+ "Math I MCQs",
+ "Math II MCQs",
+ "Political Science MCQs",
+ ],
+ "first_token_accuracy": [
+ "English MCQs",
+ "Biology MCQs",
+ "Chemistry MCQs",
+ "History MCQs",
+ "Math I MCQs",
+ "Math II MCQs",
+ "Political Science MCQs",
+ ],
+ "single_choice_accuracy": [
+ "English MCQs",
+ "Biology MCQs",
+ "Chemistry MCQs",
+ "History MCQs",
+ "Math I MCQs",
+ "Math II MCQs",
+ "Political Science MCQs",
+ ],
+ "multi_choice_accuracy": [
+ "Chinese Lang and Usage MCQs",
+ "Chinese Modern Lit",
+ "English Fill in Blanks",
+ "English Reading Comp",
+ "Geography MCQs",
+ "Physics MCQs",
+ "English Cloze Test",
+ ],
+ "math_equivalence": ["Math I Fill-in-the-Blank", "Math II Fill-in-the-Blank"],
+ "rouge_score": ["English Language Cloze Passage"],
+ "rouge_zh_score": [
+ "Chinese Language Famous Passages and Sentences Dictation",
+ "Chemistry Open-ended Questions",
+ "History Open-ended Questions",
+ "Biology Open-ended Questions",
+ "Political Science Open-ended Questions",
+ "English Language Error Correction",
+ "Chinese Language Language and Writing Skills Open-ended Questions",
+ "Math II Open-ended Questions",
+ "Chinese Language Literary Text Reading",
+ "Chinese Language Ancient Poetry Reading",
+ "Chinese Language Classical Chinese Reading",
+ "Physics Open-ended Questions",
+ "Math I Open-ended Questions",
+ "Geography Open-ended Questions",
+ "Chinese Language Practical Text Reading",
+ ],
+ "perplexity": ["ALL"],
+ "ppl_score_over_choices": ["ALL"],
+ "ppl_score": ["ALL"],
+ },
+ "longbench": {
+ "f1_score": ["hotpotqa", "2wikimqa", "musique", "narrativeqa", "qasper", "multifieldqa_en", "triviaqa"],
+ "f1_zh_score": ["multifieldqa_zh"],
+ "rouge_score": ["gov_report", "qmsum", "multi_news", "samsum"],
+ "rouge_zh_score": ["dureader", "vcsum"],
+ "retrieval_score": ["passage_retrieval_en"],
+ "retrieval_zh_score": ["passage_retrieval_zh"],
+ "classification_score": ["trec", "lsht"],
+ "code_sim_score": ["lcc", "repobench-p"],
+ "count_score": ["passage_count"],
+ "perplexity": ["ALL"],
+ "ppl_score": ["ALL"],
+ },
+ "mmlu": {
+ "first_token_accuracy": ["ALL"],
+ "single_choice_accuracy": ["ALL"],
+ "accuracy": ["ALL"],
+ "perplexity": ["ALL"],
+ "ppl_score_over_choices": ["ALL"],
+ "ppl_score": ["ALL"],
+ },
+ "mtbench": {"mtbench_single_judge": ["ALL"]},
+}
+
+
+def _fix_fracs(string):
+ substrs = string.split("\\frac")
+ new_str = substrs[0]
+ if len(substrs) > 1:
+ substrs = substrs[1:]
+ for substr in substrs:
+ new_str += "\\frac"
+ if substr[0] == "{":
+ new_str += substr
+ else:
+ try:
+ assert len(substr) >= 2
+ except:
+ return string
+ a = substr[0]
+ b = substr[1]
+ if b != "{":
+ if len(substr) > 2:
+ post_substr = substr[2:]
+ new_str += "{" + a + "}{" + b + "}" + post_substr
+ else:
+ new_str += "{" + a + "}{" + b + "}"
+ else:
+ if len(substr) > 2:
+ post_substr = substr[2:]
+ new_str += "{" + a + "}" + b + post_substr
+ else:
+ new_str += "{" + a + "}" + b
+ string = new_str
+ return string
+
+
+def _fix_a_slash_b(string):
+ if len(string.split("/")) != 2:
+ return string
+ a = string.split("/")[0]
+ b = string.split("/")[1]
+ try:
+ a = int(a)
+ b = int(b)
+ assert string == "{}/{}".format(a, b)
+ new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
+ return new_string
+ except:
+ return string
+
+
+def _remove_right_units(string):
+ # "\\text{ " only ever occurs (at least in the val set) when describing units
+ if "\\text{ " in string:
+ splits = string.split("\\text{ ")
+ assert len(splits) == 2
+ return splits[0]
+ else:
+ return string
+
+
+def _fix_sqrt(string):
+ if "\\sqrt" not in string:
+ return string
+ splits = string.split("\\sqrt")
+ new_string = splits[0]
+ for split in splits[1:]:
+ if split[0] != "{":
+ a = split[0]
+ new_substr = "\\sqrt{" + a + "}" + split[1:]
+ else:
+ new_substr = "\\sqrt" + split
+ new_string += new_substr
+ return new_string
+
+
+def _strip_string(string):
+ # linebreaks
+ string = string.replace("\n", "")
+ # print(string)
+
+ # remove inverse spaces
+ string = string.replace("\\!", "")
+ # print(string)
+
+ # replace \\ with \
+ string = string.replace("\\\\", "\\")
+ # print(string)
+
+ # replace tfrac and dfrac with frac
+ string = string.replace("tfrac", "frac")
+ string = string.replace("dfrac", "frac")
+ # print(string)
+
+ # remove \left and \right
+ string = string.replace("\\left", "")
+ string = string.replace("\\right", "")
+ # print(string)
+
+ # Remove circ (degrees)
+ string = string.replace("^{\\circ}", "")
+ string = string.replace("^\\circ", "")
+
+ # remove dollar signs
+ string = string.replace("\\$", "")
+
+ # remove units (on the right)
+ string = _remove_right_units(string)
+
+ # remove percentage
+ string = string.replace("\\%", "")
+ string = string.replace("\%", "")
+
+ # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+ string = string.replace(" .", " 0.")
+ string = string.replace("{.", "{0.")
+ # if empty, return empty string
+ if len(string) == 0:
+ return string
+ if string[0] == ".":
+ string = "0" + string
+
+ # to consider: get rid of e.g. "k = " or "q = " at beginning
+ if len(string.split("=")) == 2:
+ if len(string.split("=")[0]) <= 2:
+ string = string.split("=")[1]
+
+ # fix sqrt3 --> sqrt{3}
+ string = _fix_sqrt(string)
+
+ # remove spaces
+ string = string.replace(" ", "")
+
+ # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
+ string = _fix_fracs(string)
+
+ # manually change 0.5 --> \frac{1}{2}
+ if string == "0.5":
+ string = "\\frac{1}{2}"
+
+ # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+ string = _fix_a_slash_b(string)
+
+ return string
+
+
+def parse_math_answer(raw_string):
+ def remove_boxed(s):
+ left = "\\boxed{"
+ try:
+ assert s[: len(left)] == left
+ assert s[-1] == "}"
+ answer = s[len(left) : -1]
+ if "=" in answer:
+ answer = answer.split("=")[-1].lstrip(" ")
+ return answer
+ except:
+ return None
+
+ def last_boxed_only_string(string):
+ idx = string.rfind("\\boxed")
+ if idx < 0:
+ idx = string.rfind("\\fbox")
+ if idx < 0:
+ return None
+ i = idx
+ right_brace_idx = None
+ num_left_braces_open = 0
+ while i < len(string):
+ if string[i] == "{":
+ num_left_braces_open += 1
+ if string[i] == "}":
+ num_left_braces_open -= 1
+ if num_left_braces_open == 0:
+ right_brace_idx = i
+ break
+ i += 1
+
+ if right_brace_idx == None:
+ retval = None
+ else:
+ retval = string[idx : right_brace_idx + 1]
+
+ return retval
+
+ def get_answer_with_dollar_sign(s):
+ first_pattern = "\$(.*)\$"
+ last_match = None
+ matches = re.findall(first_pattern, s)
+ if matches:
+ last_match = matches[-1]
+ if "=" in last_match:
+ last_match = last_match.split("=")[-1].lstrip(" ")
+ return last_match
+
+ def get_answer_without_dollar_sign(s):
+ last_match = None
+ if "=" in s:
+ last_match = s.split("=")[-1].lstrip(" ").rstrip(".")
+ if "\\n" in last_match:
+ last_match = last_match.split("\\n")[0]
+ else:
+ pattern = "(?:\\$)?\d+(?:\.\d+)?(?![\w\d])"
+ matches = re.findall(pattern, s)
+ if matches:
+ last_match = matches[-1]
+ return last_match
+
+ if "\\boxed" in raw_string:
+ answer = remove_boxed(last_boxed_only_string(raw_string))
+ else:
+ answer = get_answer_with_dollar_sign(raw_string)
+ if not answer:
+ answer = get_answer_without_dollar_sign(raw_string)
+ return answer
+
+
+def math_equivalence(prediction, reference, **kwargs):
+ prediction = parse_math_answer(prediction)
+
+ if prediction is None and reference is None:
+ print("WARNING: Both None")
+ return False
+
+ if prediction is None or reference is None:
+ return False
+
+ try:
+ ss1 = _strip_string(prediction)
+ ss2 = _strip_string(reference)
+ return ss1 == ss2
+ except:
+ return prediction == reference
+
+
+def multi_choice_accuracy(prediction, reference, **kwargs):
+ # Only find uppercase letters not surrounded by lowercase letters
+ all_classes = kwargs.get("all_classes", None)
+ if all_classes:
+ pattern = f"(? highest_similarity:
+ highest_similarity = similarity
+ best_match = string
+ score = float(best_match == reference)
+ return score
+
+
+def rouge_score(prediction, reference, **kwargs):
+ rouge = Rouge()
+ try:
+ scores = rouge.get_scores([prediction], [reference], avg=True)
+ except:
+ return 0.0
+ return scores["rouge-l"]["f"]
+
+
+def rouge_zh_score(prediction, reference, **kwargs):
+ prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
+ reference = " ".join(list(jieba.cut(reference, cut_all=False)))
+ score = rouge_score(prediction, reference)
+ return score
+
+
+def _f1_score(prediction, reference, **kwargs):
+ common = Counter(prediction) & Counter(reference)
+ num_same = sum(common.values())
+ if num_same == 0:
+ return 0
+ precision = 1.0 * num_same / len(prediction)
+ recall = 1.0 * num_same / len(reference)
+ f1 = (2 * precision * recall) / (precision + recall)
+ return f1
+
+
+def f1_score(prediction, reference, **kwargs):
+ normalized_prediction = normalize_answer(prediction)
+ normalized_ground_truth = normalize_answer(reference)
+
+ prediction_tokens = normalized_prediction.split()
+ ground_truth_tokens = normalized_ground_truth.split()
+ return _f1_score(prediction_tokens, ground_truth_tokens)
+
+
+def f1_zh_score(prediction, reference, **kwargs):
+ prediction_tokens = list(jieba.cut(prediction, cut_all=False))
+ ground_truth_tokens = list(jieba.cut(reference, cut_all=False))
+ prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens]
+ ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens]
+ prediction_tokens = [token for token in prediction_tokens if len(token) > 0]
+ ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0]
+ return _f1_score(prediction_tokens, ground_truth_tokens)
diff --git a/applications/ColossalEval/colossal_eval/evaluate/evaluator.py b/applications/ColossalEval/colossal_eval/evaluate/evaluator.py
new file mode 100644
index 000000000000..11e204b504c5
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/evaluate/evaluator.py
@@ -0,0 +1,110 @@
+import os
+from typing import Any, Dict, List
+
+import colossal_eval.evaluate.gpt_evaluate as gpt_evaluate
+
+from .utils import get_data_per_category
+
+
+class Evaluator(object):
+ """
+ A class named Evaluator includes GPT-3.5/GPT-4 evaluation
+
+ """
+
+ def __init__(
+ self,
+ params: Dict[str, Any],
+ battle_prompt: Dict[str, Any],
+ gpt_evaluation_prompt: Dict[str, Any],
+ gpt_model: str,
+ language: str,
+ gpt_with_reference: bool,
+ ) -> None:
+ self.params = params
+ self.battle_prompt = battle_prompt
+ self.gpt_evaluation_prompt = gpt_evaluation_prompt
+ self.gpt_model = gpt_model
+ self.language = language
+ self.gpt_with_reference = gpt_with_reference
+ self.gpt_evaluation_results = dict()
+ self.battle_results = []
+
+ def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None:
+ """
+ Comparison between two models using GPT-4 as the reviewer.
+ """
+
+ self.battle_results = gpt_evaluate.battle(answers1, answers2, self.battle_prompt)
+
+ def evaluate(self, answers: List[Dict], targets: List[Dict], save_path: str, model_name: str) -> None:
+ """
+ A comprehensive evaluation of the answers from the model.
+ The function evaluates the model's performance from different perspectives
+ using GPT-3.5, GPT-4, and off-the-shelf evaluation metrics.
+
+ The metrics will be decided by the config file.
+
+ """
+
+ answers_per_category = get_data_per_category(answers, list(self.params.keys()))
+ targets_per_category = get_data_per_category(targets, list(self.params.keys()))
+
+ # gpt evaluation
+ for category in self.params:
+ if len(answers_per_category[category]) == 0:
+ print(f"Category {category} specified in your config doesn't have corresponding answers!")
+ continue
+
+ if self.params[category].get("GPT", None) is None:
+ continue
+
+ category_metrics = self.params[category]["GPT"]
+
+ prompt = self.gpt_evaluation_prompt.get(category, None)
+ if prompt is None:
+ print(f"No prompt for category {category}! Use prompt for category general now.")
+ prompt = self.gpt_evaluation_prompt["general"]
+
+ self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(
+ answers_per_category[category],
+ prompt,
+ category_metrics,
+ category,
+ save_path,
+ model_name,
+ self.gpt_model,
+ self.language,
+ references=targets_per_category[category] if self.gpt_with_reference else None,
+ )
+
+ def save(self, path: str, model_name_list: List[str]) -> None:
+ """
+ Save evaluation results of GPT-3.5, GPT-4, and off-the-shelf evaluation metrics.
+
+ """
+
+ if len(model_name_list) == 2:
+ save_path = os.path.join(path, "gpt_evaluate", "battle_results")
+ gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
+ else:
+ if self.gpt_evaluation_results:
+ # Save evaluation results for GPT evaluation metrics.
+ gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
+ gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")
+
+ all_evaluations = gpt_evaluate.save_gpt_evaluation_results(
+ model_name_list[0], self.gpt_evaluation_results, gpt_evaluation_results_save_path
+ )
+
+ # Start to calculate scores and save statistics.
+ gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics")
+ gpt_evaluate.save_gpt_evaluation_statistics(
+ model_name_list[0], all_evaluations, gpt_evaluation_statistics_save_path
+ )
+
+ # Save charts and csv.
+ gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses")
+ gpt_evaluate.analyze_gpt_evaluation_statistics(
+ gpt_evaluation_statistics_save_path, gpt_evaluation_analyses_save_path
+ )
diff --git a/applications/Chat/evaluate/gpt_evaluate.py b/applications/ColossalEval/colossal_eval/evaluate/gpt_evaluate.py
similarity index 80%
rename from applications/Chat/evaluate/gpt_evaluate.py
rename to applications/ColossalEval/colossal_eval/evaluate/gpt_evaluate.py
index 6fcbe63d0253..a0b1ed1143f0 100644
--- a/applications/Chat/evaluate/gpt_evaluate.py
+++ b/applications/ColossalEval/colossal_eval/evaluate/gpt_evaluate.py
@@ -11,23 +11,21 @@
import pandas as pd
import seaborn as sns
import tqdm
-from utils import jdump, jload
+from colossal_eval.utils import jdump, jload
ref_step_template = {
- "en":
- "Now please compare the answer with the {adjective} answer, determine whether the answer is able to achieve the same level of {metric}.\n\n",
- "cn":
- "请比较答案与上面的{adjective}答案,确定答案是否可以达到与该{adjective}答案同样水平的{metric}。\n\n"
+ "en": "Now please compare the answer with the {adjective} answer, determine whether the answer is able to achieve the same level of {metric}.\n\n",
+ "cn": "请比较答案与上面的{adjective}答案,确定答案是否可以达到与该{adjective}答案同样水平的{metric}。\n\n",
}
ref_answer_template_general = {
"en": "\nAn example answer with good quality is as follows:\n\n{answer}\n\n",
- "cn": "\n一个优质的示例答案如下:\n\n{answer}\n\n"
+ "cn": "\n一个优质的示例答案如下:\n\n{answer}\n\n",
}
ref_answer_template_correctness = {
"en": "\nA correct answer is as follows:\n\n{answer}\n\n",
- "cn": "\n标准答案如下:\n\n{answer}\n\n"
+ "cn": "\n标准答案如下:\n\n{answer}\n\n",
}
@@ -51,10 +49,7 @@ def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: in
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
- {
- "role": "system",
- "content": sys_prompt
- },
+ {"role": "system", "content": sys_prompt},
{
"role": "user",
"content": user_prompt,
@@ -106,7 +101,7 @@ def parse_battle_score(evaluation: str) -> List[float]:
return [float(sp[0]), float(sp[1])]
else:
raise Exception(f"Invalid score pair. Got {evaluation}.")
- except Exception as e:
+ except Exception:
return [-1, -1]
@@ -125,9 +120,6 @@ def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]
assert len(answer1) == len(answer2)
- handles = []
- evaluation_file = []
-
total_len = len(answer1)
question_idx_list = list(range(total_len))
@@ -140,9 +132,12 @@ def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]
assert answer1[i]["id"] == answer2[i]["id"]
answer_id = answer1[i]["id"]
- ques = answer1[i]["instruction"] if answer1[i][
- "input"] == "" else answer1[i]["instruction"] + " " + answer1[i]["input"]
- cat = answer1[i]["category"]
+ ques = (
+ answer1[i]["instruction"]
+ if answer1[i]["input"] == ""
+ else answer1[i]["instruction"] + " " + answer1[i]["input"]
+ )
+ answer1[i]["category"]
ans1 = answer1[i]["output"]
ans2 = answer2[i]["output"]
@@ -267,7 +262,11 @@ def reference_template(metric: str, language: str, reference: Dict[str, Any]) ->
step_to_add = ref_step_template[language]
- for_the_given_answer = "{metric} (1-5) (directly give the score for the given answer):" if language == "en" else "{metric} (1-5) (直接对给定答案打分)"
+ for_the_given_answer = (
+ "{metric} (1-5) (directly give the score for the given answer):"
+ if language == "en"
+ else "{metric} (1-5) (直接对给定答案打分)"
+ )
# adjective is used to describe the word "answer" in the prompt.
adjective = "example" if language == "en" else "示例"
@@ -280,8 +279,9 @@ def reference_template(metric: str, language: str, reference: Dict[str, Any]) ->
answer_to_add = ref_answer_template_correctness[language]
answer_to_add = answer_to_add.format(answer=reference["target"] if reference["target"] else reference["output"])
- step_to_add = step_to_add.format(metric=metric.lower(),
- adjective=adjective) + for_the_given_answer.format(metric=metric)
+ step_to_add = step_to_add.format(metric=metric.lower(), adjective=adjective) + for_the_given_answer.format(
+ metric=metric
+ )
return answer_to_add + step_to_add
@@ -329,7 +329,8 @@ def multiturn_chat_completion(user_messages: List[str], model: str, max_tokens:
for j in range(i):
messages_to_send.append(fill_in_message("user", user_messages[j]))
messages_to_send.append(
- fill_in_message("assistant", assistant_responses[j]["choices"][0]["message"]["content"]))
+ fill_in_message("assistant", assistant_responses[j]["choices"][0]["message"]["content"])
+ )
# Length of user messages == Length of assistant messages + 1
# Because we always expect the api to response
@@ -351,17 +352,19 @@ def multiturn_chat_completion(user_messages: List[str], model: str, max_tokens:
return assistant_responses[-1]
-def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
- inst: Dict[str, Any],
- metrics: List[str],
- language: str,
- reference: Dict[str, Any] = None,
- model: str = "gpt-3.5-turbo",
- max_tokens: int = 2048) -> Dict[str, Any]:
+def get_gpt_evaluation_without_logprobs(
+ prompt: Dict[str, Any],
+ inst: Dict[str, Any],
+ metrics: List[str],
+ language: str,
+ reference: Dict[str, Any] = None,
+ model: str = "gpt-3.5-turbo",
+ max_tokens: int = 2048,
+) -> Dict[str, Any]:
"""
Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.
- Temperature is set to 0 to make the model more deterministic.
+ Temprature is set to 0 to make the model more deterministic.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
@@ -378,7 +381,7 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
MAX_API_RETRY = 3
- question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"])
+ question = inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"]
answer = inst["output"]
inst["evaluation"] = {}
@@ -398,12 +401,11 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
steps=prompt["CoT"][metric],
)
- if prompt_reference:
+ if prompt_reference and (reference["target"] or reference["output"]):
# Do a 2-round conversation
- response = multiturn_chat_completion([prompt_1st_round, prompt_reference],
- model,
- max_tokens=max_tokens,
- turns=2)
+ response = multiturn_chat_completion(
+ [prompt_1st_round, prompt_reference], model, max_tokens=max_tokens, turns=2
+ )
else:
response = multiturn_chat_completion([prompt_1st_round], model, max_tokens=max_tokens, turns=1)
@@ -427,15 +429,14 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
return inst
-def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
- inst: Dict[str, Any],
- metrics: List[str],
- max_tokens: int = 2048) -> Dict[str, Any]:
+def get_gpt_evaluation_with_logprobs(
+ prompt: Dict[str, Any], inst: Dict[str, Any], metrics: List[str], max_tokens: int = 2048
+) -> Dict[str, Any]:
"""
Use completion model(text-davinci-003) to evaluate one model answer.
Only completion models can return log probabilities.
- Temperature is set to 0 to make the model more deterministic.
+ Temprature is set to 0 to make the model more deterministic.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
@@ -449,7 +450,7 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
MAX_API_RETRY = 3
- question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"])
+ question = inst["instruction"] if inst["input"] == "" else inst["instruction"] + "\n" + inst["input"]
answer = inst["output"]
inst["evaluation"] = {}
@@ -492,13 +493,17 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
return inst
-def evaluate(answers: List[Dict],
- prompt: Dict[str, Any],
- metrics: List[str],
- category: str,
- model: str,
- language: str,
- references: List[Dict] = None) -> List[Dict]:
+def evaluate(
+ answers: List[Dict],
+ prompt: Dict[str, Any],
+ metrics: List[str],
+ category: str,
+ save_path: str,
+ model_name: str,
+ model: str,
+ language: str,
+ references: List[Dict] = None,
+) -> List[Dict]:
"""
Use GPT models to evaluate model answers and save evaluation results.
@@ -522,6 +527,72 @@ def evaluate(answers: List[Dict],
metrics_str = ", ".join(x for x in metrics)
print(f"Category {category}'s metrics are {metrics_str}.")
+ gpt_base_save_path = os.path.join(save_path, "gpt_evaluate", "gpt_evaluate_results")
+ gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")
+ category_file = os.path.join(gpt_evaluation_results_save_path, model_name, f"{category}_evaluation_results.json")
+
+ if os.path.exists(category_file):
+ print(f"Evaluation results for category {category}, model {model_name} already exists.")
+ print("Skip evaluating.")
+
+ evaluations = jload(category_file)
+
+ retry = []
+ evaluations_copy = deepcopy(evaluations)
+
+ success = []
+ for idx, e in enumerate(evaluations_copy):
+ keys = list(e["evaluation"].keys())
+ for key in keys:
+ if e["evaluation"][key] == {}:
+ retry.append(e["id"])
+ print(f"Re-evaluate id {e['id']} now.")
+ break
+ if e["id"] not in retry:
+ success.append(e)
+
+ if len(retry) == 0:
+ evaluations.sort(key=lambda x: x["id"])
+ print(f"{category} done.")
+ return evaluations
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+ futures = []
+ for idx, inst in enumerate(answers):
+ if not inst["id"] in retry:
+ continue
+ # Completion models can return log probabilities.
+ if model == "text-davinci-003":
+ future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
+ else:
+ future = executor.submit(
+ get_gpt_evaluation_without_logprobs,
+ prompt,
+ inst,
+ metrics,
+ language,
+ reference=None if references is None else references[idx],
+ model=model,
+ max_tokens=1,
+ )
+
+ futures.append(future)
+
+ for future in tqdm.tqdm(
+ concurrent.futures.as_completed(futures),
+ desc=f"{category}: ",
+ total=len(futures),
+ ):
+ success.append(future.result())
+
+ success.sort(key=lambda x: x["id"])
+
+ print(f"Saving evaluation results for category {category}, model {model_name}.")
+
+ jdump(success, category_file)
+
+ return success
+
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for idx, inst in enumerate(answers):
@@ -529,21 +600,23 @@ def evaluate(answers: List[Dict],
if model == "text-davinci-003":
future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
else:
- future = executor.submit(get_gpt_evaluation_without_logprobs,
- prompt,
- inst,
- metrics,
- language,
- reference=None if references is None else references[idx],
- model=model,
- max_tokens=1)
+ future = executor.submit(
+ get_gpt_evaluation_without_logprobs,
+ prompt,
+ inst,
+ metrics,
+ language,
+ reference=None if references is None else references[idx],
+ model=model,
+ max_tokens=1,
+ )
futures.append(future)
for future in tqdm.tqdm(
- concurrent.futures.as_completed(futures),
- desc=f"{category}: ",
- total=len(futures),
+ concurrent.futures.as_completed(futures),
+ desc=f"{category}: ",
+ total=len(futures),
):
evaluations.append(future.result())
@@ -551,6 +624,10 @@ def evaluate(answers: List[Dict],
print(f"{category} done.")
+ print(f"Saving evaluation results for category {category}, model {model_name}.")
+
+ jdump(evaluations, category_file)
+
return evaluations
@@ -576,7 +653,7 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
for key, value in logprobs.items():
# Sometimes the key will be one byte of a unicode character which takes the form of "bytes:\\xe7".
- # It is meaningless, and thus we don't calculate probability.
+ # It is meaningless and thus we don't calculate probability.
if "bytes" in key:
continue
# results[0] is the score which corresponds to the key(predicted token).
@@ -593,7 +670,7 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
"""
Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
- Different from text-davinci-003, this function directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
+ Different from text-davinci-003, this fuction directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.
Args:
@@ -610,18 +687,19 @@ def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) ->
return int(results[0])
else:
raise Exception(f"Invalid score pair. Got {evaluation}.")
- except Exception as e:
+ except Exception:
return 0
-def save_gpt_evaluation_results(model_name: str, gpt_evaluation_results: Dict[str, Any],
- save_path: str) -> Dict[str, Any]:
+def save_gpt_evaluation_results(
+ model_name: str, gpt_evaluation_results: Dict[str, Any], save_path: str
+) -> Dict[str, Any]:
"""
Save evaluation results for different categories for one model.
Args:
model_name: name of the model for saving evaluation results.
- gpt_evaluation_results: evaluations results for all the model answers.
+ gpt_evaluation_results: evaluations results for all of the model answers.
save_path: path to save GPT evaluation statistics.
"""
@@ -641,7 +719,7 @@ def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], sav
Args:
model_name: name of the model for saving statistics.
- evaluations: evaluations for all the model answers.
+ evaluations: evaluations for all of the model answers.
save_path: path to save GPT evaluation statistics.
"""
@@ -663,14 +741,16 @@ def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], sav
for evaluation in data:
for metric in metrics:
if evaluation["evaluation"][metric] == {}:
- # This means after 3 retries, the server still returns an error, and we set the score to 0.
+ # This means after 3 retries, the server still returns an error and we set the score to 0.
scores[metric].append(0)
elif evaluation["evaluation"][metric]["logprobs"] is not None:
scores[metric].append(
- calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
+ calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0])
+ )
else:
scores[metric].append(
- calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation))
+ calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation)
+ )
statistics = {}
for metric in metrics:
@@ -751,9 +831,9 @@ def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> N
frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv"))
for category in tqdm.tqdm(
- frame_per_category.keys(),
- desc=f"GPT evaluation: ",
- total=len(frame_per_category.keys()),
+ frame_per_category.keys(),
+ desc=f"GPT evaluation: ",
+ total=len(frame_per_category.keys()),
):
data = pd.DataFrame(frame_per_category[category])
diff --git a/applications/ColossalEval/colossal_eval/evaluate/utils.py b/applications/ColossalEval/colossal_eval/evaluate/utils.py
new file mode 100644
index 000000000000..69fec46705ab
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/evaluate/utils.py
@@ -0,0 +1,8 @@
+def get_data_per_category(data, categories):
+ data_per_category = {category: [] for category in categories}
+ for item in data:
+ category = item["category"]
+ if category in categories:
+ data_per_category[category].append(item)
+
+ return data_per_category
diff --git a/applications/ColossalEval/colossal_eval/models/__init__.py b/applications/ColossalEval/colossal_eval/models/__init__.py
new file mode 100644
index 000000000000..8f6c9b414145
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/models/__init__.py
@@ -0,0 +1,5 @@
+from .base import BaseModel
+from .chatglm import ChatGLM2Model, ChatGLMModel
+from .huggingface import HuggingFaceCausalLM, HuggingFaceModel
+
+__all__ = ["BaseModel", "HuggingFaceModel", "HuggingFaceCausalLM", "ChatGLMModel", "ChatGLM2Model"]
diff --git a/applications/ColossalEval/colossal_eval/models/base.py b/applications/ColossalEval/colossal_eval/models/base.py
new file mode 100644
index 000000000000..aae796c1d56e
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/models/base.py
@@ -0,0 +1,78 @@
+from abc import abstractclassmethod
+from typing import Dict, List
+
+from colossal_eval.utils import Conversation, prompt_templates
+
+from colossalai.logging import DistributedLogger
+
+
+class BaseModel:
+ """
+ Base class for model wrapper.
+
+ Args:
+ path: The path to the model.
+ model_max_length: The maximum sequence length of the model.
+ prompt_template: The model's prompt template.
+ batch_size: Batch size for inference.
+ logger: Logger for the model.
+ """
+
+ def __init__(
+ self,
+ path: str,
+ model_max_length: int = 2048,
+ prompt_template: Conversation = None,
+ batch_size: int = 1,
+ logger: DistributedLogger = None,
+ ):
+ self.path = path
+ self.model_max_length = model_max_length
+
+ if prompt_template:
+ self.prompt_template = prompt_template
+ else:
+ self.prompt_template = prompt_templates["plain"]
+
+ self.batch_size = batch_size
+ self.logger = logger
+
+ @abstractclassmethod
+ def inference(self, data: List[Dict]) -> None:
+ """
+ Infer the given data.
+ This function will call self.generate() to get model outputs and also self.model(input) to get logits.
+
+ Args:
+ data: The data for inference.
+ """
+
+ @abstractclassmethod
+ def generate(self, inputs: List[str], max_new_tokens: int) -> List[str]:
+ """
+ Generate results given a list of inputs.
+
+ Args:
+ inputs: A list of strings.
+ max_new_tokens: The maximum length of the output.
+
+ Returns:
+ A list of generated strings.
+ """
+
+ @abstractclassmethod
+ def get_loss(self, batch: List[str], batch_target: List[str]) -> List[float]:
+ """
+ Get loss given batch and batch with target.
+ Use their length difference after tokenization to mask the loss and only compute loss at target tokens.
+
+ Args:
+ batch: batch prompt without target answer.
+ batch_target: batch prompt with target answer.
+
+ Returns:
+ A list of loss.
+ """
+
+ def to(self, device):
+ self.model.to(device)
diff --git a/applications/ColossalEval/colossal_eval/models/chatglm.py b/applications/ColossalEval/colossal_eval/models/chatglm.py
new file mode 100644
index 000000000000..f293c4f699cd
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/models/chatglm.py
@@ -0,0 +1,303 @@
+import copy
+from typing import List
+
+import torch
+
+from .huggingface import HuggingFaceModel
+
+IGNORE_INDEX = -100
+
+
+class ChatGLMModel(HuggingFaceModel):
+ def _get_truncated_prompts(self, inputs: List[str], max_new_tokens: int) -> List[str]:
+ truncated_inputs = copy.deepcopy(inputs)
+ # Adapted from https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py#L187
+ for i, input in enumerate(inputs):
+ a_ids = self.tokenizer.encode(text=input, truncation=False, add_special_tokens=False)
+
+ if len(a_ids) > self.model_max_length - max_new_tokens:
+ half = (self.model_max_length - max_new_tokens) // 2
+ prompt = self.tokenizer.decode(a_ids[:half], skip_special_tokens=True) + self.tokenizer.decode(
+ a_ids[-half:], skip_special_tokens=True
+ )
+ truncated_inputs[i] = prompt
+
+ return truncated_inputs
+
+ @torch.no_grad()
+ def get_loss(
+ self, batch_prompt: List[str], batch_target: List[List[str]], pretrain: bool = False
+ ) -> List[List[float]]:
+ """
+ Calculate loss only on target tokens.
+
+ Args:
+ batch: A batch of prompt without target answer.
+ batch_target: A batch of target answer. Sometimes one question can have multiple target answers.
+
+ Returns:
+ Loss.
+
+ """
+
+ # We set max_new_tokens in self._get_truncated_prompts to 0 because we only need logits to calculate loss.
+ # We don't need to generate new tokens.
+ # Target answer's length is usually << model_max_length, but we still call it in case.
+ # We don't call self._get_truncated_prompts for batch_prompt because we need target answer's length first to reserve some space for target answer's tokens.
+ batch_target = [self._get_truncated_prompts(prompt_target, 0) for prompt_target in batch_target]
+
+ # Get the number of target answers for different questions
+ batch_target_nums = [len(prompt_target) for prompt_target in batch_target]
+
+ labels_list = []
+ input_ids_list = []
+
+ for input, targets in zip(batch_prompt, batch_target):
+ for target in targets:
+ # Adapted from https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py#L187
+ # If there is no history, the prompt is just the query.
+ # We don't need to override self.generate() in ChatGLM-6B but need to override it in ChatGLM2-6B.
+ # See https://huggingface.co/THUDM/chatglm-6b/blob/main/modeling_chatglm.py#L1276
+ target_tokenized = self.tokenizer.encode(text=target, add_special_tokens=False)
+
+ # Get prompt with length model_max_length - len(target_tokenized).
+ # Reserve some space for target answer tokens using max_new_tokens.
+ # This will generate the correct start_idx and end_idx.
+ max_new_tokens = len(target_tokenized)
+
+ # Here 3 tokens are reserved for [gmask_id, bos_token, eos_id]. So we reserve max_new_tokens + 3 tokens.
+ # See https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py#L323
+ prompt_with_correct_length = self._get_truncated_prompts([input], max_new_tokens + 3)[0]
+ input_tokenized = self.tokenizer.encode(prompt_with_correct_length, add_special_tokens=False)
+
+ input_ids = self.tokenizer.build_inputs_with_special_tokens(input_tokenized, target_tokenized)
+
+ context_length = input_ids.index(self.tokenizer.bos_token_id)
+ context_length - 1
+
+ target_ids = [IGNORE_INDEX] * len(input_ids)
+
+ # -1 is for eos_token, we don't want to calculate loss on eos token.
+ target_ids[-max_new_tokens - 1 : -1] = input_ids[-max_new_tokens - 1 : -1]
+
+ input_ids_list.append(torch.LongTensor(input_ids))
+ labels_list.append(torch.LongTensor(target_ids))
+
+ # Because of multiple target answers, the final batch size may be greater than self.batch_size.
+ # We will generate new batches.
+ losses = []
+ target_token_nums = []
+
+ batched_input_ids = [
+ input_ids_list[i : i + self.batch_size] for i in range(0, len(input_ids_list), self.batch_size)
+ ]
+ batched_labels = [labels_list[i : i + self.batch_size] for i in range(0, len(labels_list), self.batch_size)]
+
+ for batch_input_ids, batch_labels in zip(batched_input_ids, batched_labels):
+ losses_per_batch, target_token_num_per_batch = self._calculate_loss(batch_input_ids, batch_labels)
+ losses.extend(losses_per_batch)
+ target_token_nums.extend(target_token_num_per_batch)
+
+ start_indice = 0
+ losses_per_sample = []
+
+ target_token_nums_per_sample = []
+ for length in batch_target_nums:
+ losses_per_sample.append(losses[start_indice : start_indice + length])
+ target_token_nums_per_sample.append(target_token_nums[start_indice : start_indice + length])
+ start_indice += length
+
+ return losses_per_sample, target_token_nums_per_sample, None
+
+ def _calculate_loss(self, input_ids_list: List[torch.LongTensor], labels: List[torch.LongTensor]) -> List[float]:
+ """
+ Calculate loss only on target tokens.
+ Hugging Face generate() function can't return per sample loss.
+ It will only return the mean of the loss in a batch.
+ In torch.nn.CrossEntropyLoss(), reduction should be specified as "none" to get per sample loss.
+
+ Args:
+ input_ids_list: A batch of input token ids.
+ labels: A batch of labels.
+
+ Returns:
+ A list of loss.
+
+ """
+ input_ids = torch.nn.utils.rnn.pad_sequence(
+ input_ids_list, batch_first=True, padding_value=self.tokenizer.pad_token_id
+ ).to(torch.cuda.current_device())
+ labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX).to(
+ torch.cuda.current_device()
+ )
+
+ outputs = self.model(input_ids)[0]
+
+ shift_logits = outputs[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+
+ loss_fct = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=IGNORE_INDEX)
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)).view(shift_labels.size())
+
+ lens = (labels != IGNORE_INDEX).sum(-1).cpu().numpy()
+
+ loss_sum = loss.sum(-1).to(torch.float32).cpu().detach().numpy()
+ return loss_sum.tolist(), lens.tolist()
+
+
+class ChatGLM2Model(ChatGLMModel):
+ def _get_truncated_prompts(self, inputs: List[str], max_new_tokens: int) -> List[str]:
+ truncated_inputs = copy.deepcopy(inputs)
+ # Adapted from https://github.com/THUDM/ChatGLM2-6B/blob/main/ptuning/main.py#L180
+ for i, input in enumerate(inputs):
+ a_ids = self.tokenizer.encode(text=input, add_special_tokens=True, truncation=False)
+
+ if len(a_ids) > self.model_max_length - max_new_tokens:
+ half = (self.model_max_length - max_new_tokens) // 2
+ prompt = self.tokenizer.decode(a_ids[:half], skip_special_tokens=True) + self.tokenizer.decode(
+ a_ids[-half:], skip_special_tokens=True
+ )
+ truncated_inputs[i] = prompt
+
+ return truncated_inputs
+
+ @torch.no_grad()
+ def generate(self, inputs: List[str], max_new_tokens: int, **kwargs) -> List[str]:
+ """Generate results given a list of inputs and get logits of the first new token over choices.
+
+ Args:
+ inputs: A list of strings.
+ max_new_tokens: Max new tokens for generation.
+ kwargs: Key arguments for generation
+
+ Returns:
+ A list of generated strings and logits over choices.
+
+ Note:
+ Currently the function only returns the logits of the first new token.
+ It is used for single choice question.
+ For multiple choices question, please avoid using the loss over choices.
+ You should set argument choices as None in self.inference().
+
+ """
+ # Follow the process of model.chat() method in modeling_chatglm2.py
+ # See https://huggingface.co/THUDM/chatglm2-6b/blob/main/modeling_chatglm.py#L1020
+ # See https://huggingface.co/THUDM/chatglm2-6b/blob/main/modeling_chatglm.py#L1001
+
+ query = []
+ for input in inputs:
+ prompt = self.tokenizer.build_prompt(input, None)
+ query.append(prompt)
+
+ truncated_query = self._get_truncated_prompts(query, max_new_tokens)
+
+ encoded_inputs = self.tokenizer(
+ truncated_query,
+ padding=True,
+ truncation=True,
+ return_tensors="pt",
+ max_length=self.model_max_length - max_new_tokens,
+ ).to(torch.cuda.current_device())
+
+ # Set output_scores=True to get prediction scores.
+ outputs = self.model.generate(
+ **encoded_inputs, max_new_tokens=max_new_tokens, return_dict_in_generate=True, output_scores=True, **kwargs
+ )
+
+ # We only need to decode predicted tokens.
+ sequences = outputs.sequences[:, encoded_inputs["input_ids"].shape[1] :]
+
+ scores = []
+ if self.indices_for_choices:
+ # If the question is a single-choice question, we will return the scores of specific indices for first predicted token.
+ # The indices are the tokenization results of the options for the single-choice question.
+ # For example, if the options of the question are A, B, C and D, we only returns scores at indices of A, B, C and D.
+ for option_indices in self.indices_for_choices:
+ scores.append(outputs.scores[0][:, option_indices].detach().cpu())
+
+ scores = torch.max(torch.stack(scores), dim=0)[0]
+
+ decoded_sequences = self.tokenizer.batch_decode(sequences, skip_special_tokens=True)
+
+ return decoded_sequences, scores
+
+ @torch.no_grad()
+ def get_loss(
+ self, batch_prompt: List[str], batch_target: List[List[str]], pretrain: bool = False
+ ) -> List[List[float]]:
+ """
+ Calculate loss only on target tokens.
+
+ Args:
+ batch: A batch of prompt without target answer.
+ batch_target: A batch of target answer. Sometimes one question can have multiple target answers.
+
+ Returns:
+ Loss.
+
+ """
+
+ # We set max_new_tokens in self._get_truncated_prompts to 0 because we only need logits to calculate loss.
+ # We don't need to generate new tokens.
+ # Target answer's length is usually << model_max_length, but we still call it in case.
+ # We don't call self._get_truncated_prompts for batch_prompt because we need target answer's length first to reserve some space for target answer's tokens.
+ batch_target = [self._get_truncated_prompts(prompt_target, 0) for prompt_target in batch_target]
+
+ # Get the number of target answers for different questions
+ batch_target_nums = [len(prompt_target) for prompt_target in batch_target]
+
+ labels_list = []
+ input_ids_list = []
+
+ for input, targets in zip(batch_prompt, batch_target):
+ for target in targets:
+ # Adapted from https://github.com/THUDM/ChatGLM2-6B/blob/main/ptuning/main.py#L180
+ prompt = self.tokenizer.build_prompt(input, None)
+
+ target_tokenized = self.tokenizer.encode(
+ text=target, add_special_tokens=False, truncation=True, max_length=self.model_max_length
+ )
+
+ max_new_tokens = len(target_tokenized)
+ prompt_with_correct_length = self._get_truncated_prompts([prompt], max_new_tokens)[0]
+ input_tokenized = self.tokenizer.encode(
+ prompt_with_correct_length,
+ add_special_tokens=True,
+ truncation=True,
+ max_length=self.model_max_length,
+ )
+
+ input_ids = input_tokenized + target_tokenized + [self.tokenizer.eos_token_id]
+ target_ids = [IGNORE_INDEX] * len(input_ids)
+
+ # -1 is for "eos"
+ target_ids[-max_new_tokens - 1 : -1] = input_ids[-max_new_tokens - 1 : -1]
+
+ input_ids_list.append(torch.LongTensor(input_ids))
+ labels_list.append(torch.LongTensor(target_ids))
+
+ # Because of multiple target answers, the final batch size may be greater than self.batch_size.
+ # We will generate new batches.
+ losses = []
+ target_token_nums = []
+
+ batched_input_ids = [
+ input_ids_list[i : i + self.batch_size] for i in range(0, len(input_ids_list), self.batch_size)
+ ]
+ batched_labels = [labels_list[i : i + self.batch_size] for i in range(0, len(labels_list), self.batch_size)]
+
+ for batch_input_ids, batch_labels in zip(batched_input_ids, batched_labels):
+ losses_per_batch, target_token_num_per_batch = self._calculate_loss(batch_input_ids, batch_labels)
+ losses.extend(losses_per_batch)
+ target_token_nums.extend(target_token_num_per_batch)
+
+ start_indice = 0
+ losses_per_sample = []
+
+ target_token_nums_per_sample = []
+ for length in batch_target_nums:
+ losses_per_sample.append(losses[start_indice : start_indice + length])
+ target_token_nums_per_sample.append(target_token_nums[start_indice : start_indice + length])
+ start_indice += length
+
+ return losses_per_sample, target_token_nums_per_sample, None
diff --git a/applications/ColossalEval/colossal_eval/models/huggingface.py b/applications/ColossalEval/colossal_eval/models/huggingface.py
new file mode 100644
index 000000000000..693e021533bc
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/models/huggingface.py
@@ -0,0 +1,567 @@
+import copy
+import math
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+from colossal_eval.utils import Conversation, get_batch_prompt, is_rank_0
+from peft import PeftModel
+from tqdm import tqdm
+from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer
+
+from colossalai.logging import DistributedLogger
+
+from .base import BaseModel
+
+IGNORE_INDEX = -100
+
+
+class HuggingFaceModel(BaseModel):
+ """
+ Model wrapper around HuggingFace AutoModel models.
+
+ Args:
+ path: The path to a HuggingFace model.
+ model_max_length: The maximum sequence length of the model.
+ tokenizer_path: The path to the tokenizer.
+ tokenizer_kwargs: Keyword arguments for the tokenizer.
+ peft_path: The name or path to the HuggingFace's PEFT model.
+ model_kwargs: Keyword arguments for the model.
+ prompt_template: The model's prompt template.
+ batch_size: Batch size for inference.
+ logger: Logger for the model.
+
+ """
+
+ def __init__(
+ self,
+ path: str,
+ model_max_length: int = 2048,
+ tokenizer_path: Optional[str] = None,
+ tokenizer_kwargs: dict = dict(),
+ peft_path: Optional[str] = None,
+ model_kwargs: Dict = None,
+ prompt_template: Conversation = None,
+ batch_size: int = 1,
+ logger: DistributedLogger = None,
+ ):
+ super().__init__(
+ path=path,
+ model_max_length=model_max_length,
+ prompt_template=prompt_template,
+ batch_size=batch_size,
+ logger=logger,
+ )
+ self._load_tokenizer(path=path, tokenizer_path=tokenizer_path, tokenizer_kwargs=tokenizer_kwargs)
+
+ self._load_model(path=path, model_kwargs=model_kwargs, peft_path=peft_path)
+
+ def _get_choices_indices(self, language: str):
+ """
+ Get indices for each choice
+
+ Some tokenizer will insert BOS if you don't specify add_special_tokens=False such as Llama-2.
+ The indices for choices may be different given the context. For example, for Llama-2 tokenizer, for Chinese context like "答案:{choice}", indices for choices A, B, C and D are 29909, 29933, 29907 and 29928, for English context like "Answer: {choice}", indices for choices A, B, C and D are 319, 350, 315 and 360.
+ print(self.tokenizer("答案:A")) to see
+ print(self.tokenizer("Answer: A")) to see
+
+ """
+
+ # A trick for get "all" tokens ids related to given choices.
+ self.indices_for_choices = [[] for _ in range(2)]
+ for choice in self.choices:
+ self.indices_for_choices[0].append(
+ self.tokenizer(f"Answer: {choice}", add_special_tokens=False).input_ids[-1]
+ )
+ self.indices_for_choices[1].append(self.tokenizer(f"答案:{choice}", add_special_tokens=False).input_ids[-1])
+
+ def _load_tokenizer(self, path: str, tokenizer_path: Optional[str], tokenizer_kwargs: dict):
+ """
+ Load tokenizer.
+
+ Args:
+ path: The path to the model. Usually it also serves as the path to the tokenizer.
+ tokenizer_path: The path to the tokenzier.
+ tokenizer_kwargs: Keyword arguments for the tokenizer.
+
+ """
+
+ if self.batch_size > 1:
+ tokenizer_kwargs.update({"padding_side": "left"})
+ tokenizer_kwargs.update({"truncation_side": "left"})
+
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path if tokenizer_path else path, **tokenizer_kwargs)
+
+ if self.tokenizer.pad_token_id is None:
+ self.logger.warning("pad_token_id is not set for the tokenizer. " "Using eos_token_id as pad_token_id.")
+ if self.tokenizer.eos_token:
+ self.tokenizer.pad_token = self.tokenizer.eos_token
+ elif hasattr(self.tokenizer, "eod_id"):
+ # Qwen has an eod token "<|endoftext|>".
+ self.tokenizer.pad_token_id = self.tokenizer.eod_id
+
+ def _load_model(self, path: str, model_kwargs: dict, peft_path: Optional[str] = None):
+ """
+ Load model.
+
+ Args:
+ path: The path to the model.
+ model_kwargs: Keyword arguments for the model.
+ peft_path: The path to the peft model.
+
+ """
+
+ if "torch_dtype" in model_kwargs:
+ model_kwargs["torch_dtype"] = eval(model_kwargs["torch_dtype"])
+
+ model_kwargs.setdefault("torch_dtype", torch.float16)
+
+ self.model = AutoModel.from_pretrained(path, **model_kwargs).to(torch.cuda.current_device())
+ if peft_path is not None:
+ self.model = PeftModel.from_pretrained(self.model, peft_path, is_trainable=False)
+ self.model.eval()
+
+ def _calculate_loss(self, input_ids_list: List[torch.LongTensor], labels: List[torch.LongTensor]) -> Tuple[List]:
+ """
+ Calculate loss only on target tokens.
+ Hugging Face generate() function can't return per sample loss.
+ It will only return the mean of the loss in a batch.
+ In torch.nn.CrossEntropyLoss(), reduction should be specified as "none" to get per sample loss.
+
+ Args:
+ input_ids_list: A batch of input token ids.
+ labels: A batch of labels.
+
+ Returns:
+ A list of loss.
+
+ """
+ input_ids = torch.nn.utils.rnn.pad_sequence(
+ input_ids_list, batch_first=True, padding_value=self.tokenizer.pad_token_id
+ ).to(torch.cuda.current_device())
+ labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX).to(
+ torch.cuda.current_device()
+ )
+ attention_mask = input_ids.ne(self.tokenizer.pad_token_id).to(torch.cuda.current_device())
+
+ outputs = self.model(input_ids, attention_mask=attention_mask)[0]
+
+ shift_logits = outputs[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+
+ loss_fct = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=IGNORE_INDEX)
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)).view(shift_labels.size())
+
+ lens = (labels != IGNORE_INDEX).sum(-1).cpu().numpy()
+
+ loss_sum = loss.sum(-1).to(torch.float32).cpu().detach().numpy()
+ return loss_sum.tolist(), lens.tolist()
+
+ def _get_truncated_prompts(self, inputs: List[str], max_new_tokens: int) -> List[str]:
+ """
+ Truncate the input sequence to fit model_max_length (we suggest truncate in the middle, since the left and right side may contain crucial instructions)
+ https://github.com/THUDM/LongBench/blob/main/pred.py#L16
+
+ Args:
+ inputs: A batch of input prompts.
+ max_new_tokens: Max new tokens for model to generate.
+
+ Returns:
+ Truncated prompts.
+
+ """
+
+ truncated_inputs = copy.deepcopy(inputs)
+ for i, input in enumerate(inputs):
+ tokenized_prompt = self.tokenizer(input, truncation=False, return_tensors="pt").input_ids[0]
+ if len(tokenized_prompt) > self.model_max_length - max_new_tokens:
+ half = (self.model_max_length - max_new_tokens) // 2
+ prompt = self.tokenizer.decode(
+ tokenized_prompt[:half], skip_special_tokens=True
+ ) + self.tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True)
+ truncated_inputs[i] = prompt
+
+ return truncated_inputs
+
+ def _get_input_ids_and_labels_pretrain(self, batch_prompt: List[str]) -> Tuple[List[torch.LongTensor]]:
+ """
+ Get input_ids and labels for pretrain data.
+ We only need batch_prompt because for pretain dataset, we don't need to predict new tokens.
+
+ Args:
+ batch_prompt: A batch of prompt.
+
+ Returns:
+ Input_ids and labels for the given batch.
+
+ """
+ input_ids_list = []
+ labels_list = []
+ bytes_list = []
+
+ for input in batch_prompt:
+ # Pretrain data tends to be very long, sometimes much larger than the model_max_length, we only tokenize 1/ratio of the data first to accelerate the tokenization process.
+ # Once the length of the result is greater or equal to model_max_length, we stop iterating on ratios and use the result as input_ids and labels.
+ # After all, the rest of the original string doesn't need to be tokenized at the first place.
+ ratio = [16, 8, 4, 2, 1]
+ tokenized = None
+ for r in ratio:
+ tokenized = self.tokenizer(
+ [input[0 : len(input) // r]], truncation=True, max_length=self.model_max_length, return_tensors="pt"
+ )
+ if tokenized.input_ids.size(1) >= self.model_max_length:
+ break
+
+ input_ids = copy.deepcopy(tokenized["input_ids"])[0]
+ target_ids = copy.deepcopy(input_ids)
+
+ string = self.tokenizer.decode(tokenized.input_ids[0], skip_special_tokens=True)
+
+ bytes_list.append(len(string.encode("utf-8")))
+
+ input_ids_list.append(input_ids)
+ labels_list.append(target_ids)
+
+ return input_ids_list, labels_list, bytes_list
+
+ def _get_input_ids_and_labels(
+ self, batch_prompt: List[str], batch_target: List[List[str]], pretrain: bool
+ ) -> Tuple[List[torch.LongTensor]]:
+ """
+ Get input_ids and labels for the given data.
+
+ Args:
+ batch_prompt: A batch of prompt.
+ batch_target: A batch of target.
+
+ Returns:
+ Input_ids and labels for the given batch.
+
+ """
+ if pretrain:
+ return self._get_input_ids_and_labels_pretrain(batch_prompt)
+
+ input_ids_list = []
+ labels_list = []
+
+ for input, targets in zip(batch_prompt, batch_target):
+ for target in targets:
+ # TODO: Improve the labeling process. Should annotate the border by adding special tokens.
+ target_tokenized = self.tokenizer(
+ [target], truncation=True, max_length=self.model_max_length, return_tensors="pt"
+ )
+
+ # Get prompt with length model_max_length - len(target_tokenized).
+ # Reserve some space for target answer tokens using max_new_tokens.
+ # This will generate the correct start_idx and end_idx.
+ max_new_tokens = target_tokenized["input_ids"][0].size(0)
+ prompt_with_correct_length = self._get_truncated_prompts([input], max_new_tokens)[0]
+ input_tokenized = self.tokenizer(
+ [prompt_with_correct_length],
+ truncation=True,
+ max_length=self.model_max_length - max_new_tokens,
+ return_tensors="pt",
+ )
+
+ target_tokenized = self.tokenizer(
+ [prompt_with_correct_length + target],
+ truncation=True,
+ max_length=self.model_max_length,
+ return_tensors="pt",
+ )
+
+ start_idx = input_tokenized["input_ids"][0].size(0)
+ end_idx = target_tokenized["input_ids"][0].size(0)
+
+ # Sometimes if the target is only an option such as A, B, C and D, the length of input_tokenized is equal to the length of target_tokenized, so we need -1.
+ # This is caused by the different behavior of tokenizers.
+ # For example, the tokenizer for Baichuan and Llama will cause such problem in a plain prompt setting.
+ # The length of the tokenized sequences for prompt "Answer: " and "Answer: A" is the same.
+ # Baichuan: [29394, 31143, 31106] [29394, 31143, 703]
+ # Llama: [673, 29901, 29871] [673, 29901, 319]
+ # The length for sequence "prompt" and "prompt + A" is equal.
+ # For ChatGLM, the length of the tokenized sequences is different.
+ # ChatGLM: [16583, 12] [16583, 12, 167]
+
+ if start_idx == end_idx:
+ start_idx -= 1
+
+ input_ids = copy.deepcopy(target_tokenized["input_ids"])[0]
+ target_ids = copy.deepcopy(input_ids)
+
+ mask = torch.zeros_like(target_ids, dtype=torch.bool)
+ mask[start_idx:end_idx] = True
+
+ target_ids[~mask] = IGNORE_INDEX
+
+ input_ids_list.append(input_ids)
+ labels_list.append(target_ids)
+
+ return input_ids_list, labels_list, None
+
+ def inference(self, data: List[Dict], inference_kwargs: Dict[str, Any], debug: bool = False) -> List[Dict]:
+ """
+ Infer the given data.
+ This function will call self.generate() to get model outputs and also self.model() to get logits.
+
+ Args:
+ data: The data for inference.
+ inference_kwargs: Arguments for inference.
+ debug: Whether to display generated prompt for debugging.
+
+ Returns:
+ Inference results.
+
+ """
+ calculate_loss = inference_kwargs["calculate_loss"]
+ classes = inference_kwargs["all_classes"]
+ language = inference_kwargs["language"]
+ pretrain = inference_kwargs["pretrain"]
+ max_new_tokens = inference_kwargs["max_new_tokens"]
+ few_shot_data = inference_kwargs.get("few_shot_data", None)
+
+ # Some classification questions' options are texts not a single letter such as A, B, C and D.
+ # If the text length is greater than 1, we won't calculate loss over choices.
+ if classes is not None and any(len(c) > 1 for c in classes):
+ classes = None
+
+ self.choices = classes
+ self.indices_for_choices = None
+ if self.choices:
+ # Get indices for each choice
+ self._get_choices_indices(language)
+
+ self.str_label_map = {choice: idx for idx, choice in enumerate(self.choices)}
+
+ turn = 0 if not isinstance(data[0]["output"], list) else len(data[0]["output"]) + 1
+ turn_desc = "" if turn == 0 else f"-turn{turn}"
+
+ bar = tqdm(
+ range(math.ceil(len(data) / self.batch_size)),
+ desc=f"{data[0]['dataset']}-{data[0]['category']}{turn_desc} Inference steps",
+ disable=not is_rank_0(),
+ )
+ loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
+
+ answers = copy.deepcopy(data)
+ for i in range(0, len(data), self.batch_size):
+ batch = data[i : i + self.batch_size]
+ batch_prompt, batch_target = get_batch_prompt(
+ self.prompt_template, batch, few_shot_data, self.tokenizer, language, self.model_max_length
+ )
+
+ if is_rank_0() and debug and i == 0:
+ self.logger.info(
+ f"Inference arguments for dataset {data[0]['dataset']} category {data[0]['category']} is:\n{inference_kwargs}"
+ )
+ self.logger.info("-" * 120)
+ self.logger.info("An example prompt and prompt with target is:")
+ self.logger.info("-" * 120)
+ self.logger.info(batch_prompt[0])
+ self.logger.info("-" * 120)
+ self.logger.info(batch_prompt[0] + batch_target[0][0])
+
+ if not pretrain:
+ batch_decodes, scores = self.generate(batch_prompt, max_new_tokens)
+
+ if calculate_loss:
+ batch_losses, batch_target_token_nums, batch_bytes_nums = self.get_loss(
+ batch_prompt, batch_target, pretrain
+ )
+
+ probs = []
+ if self.indices_for_choices:
+ scores = scores.to(torch.float32)
+ # If we have indices_for_choices(must be single-choice question), there will be only one target answer for one data sample.
+ # Otherwise this will violate the single-choice setting.
+
+ if calculate_loss:
+ labels = [self.str_label_map[answers[i + j]["target"]] for j in range(len(batch_decodes))]
+
+ loss_over_choices = loss_fct(scores, torch.tensor(labels, dtype=torch.long)).numpy().tolist()
+
+ probs = torch.nn.functional.softmax(scores, dim=-1).numpy().tolist()
+ probs = [
+ {choice: probs[i][self.str_label_map[choice]] for choice in self.choices} for i in range(len(probs))
+ ]
+
+ for j in range(len(batch_prompt)):
+ if not pretrain:
+ if isinstance(answers[i + j]["output"], list):
+ answers[i + j]["output"].append(batch_decodes[j].strip())
+ else:
+ answers[i + j]["output"] = batch_decodes[j].strip()
+
+ if isinstance(scores, torch.Tensor):
+ answers[i + j]["softmax_over_choices"] = probs[j]
+
+ if calculate_loss:
+ answers[i + j]["loss_over_choices"] = loss_over_choices[j]
+
+ if calculate_loss:
+ answers[i + j]["loss"] = (np.array(batch_losses[j]) / np.array(batch_target_token_nums[j])).tolist()
+
+ # loss_sum is specially used for pertrain dataset for calculating per-byte-perplexity.
+ # However, loss (which is per sample loss) suffices for most cases.
+ answers[i + j]["loss_sum"] = batch_losses[j]
+ answers[i + j]["token_num"] = batch_target_token_nums[j]
+
+ if batch_bytes_nums:
+ answers[i + j]["byte_num"] = batch_bytes_nums[j]
+
+ bar.update()
+
+ return answers
+
+ @torch.no_grad()
+ def generate(self, inputs: List[str], max_new_tokens: int, **kwargs) -> List[str]:
+ """Generate results given a list of inputs and get logits of the first new token over choices.
+
+ Args:
+ inputs: A list of strings.
+ max_new_tokens: Max new tokens for generation.
+ kwargs: Key arguments for generation
+
+ Returns:
+ A list of generated strings and logits over choices.
+
+ Note:
+ Currently the function only returns the logits of the first new token.
+ It is used for single choice question.
+ For multiple choices question, please avoid using the loss over choices.
+ You should set argument choices as None in self.inference().
+
+ """
+ truncated_inputs = self._get_truncated_prompts(inputs, max_new_tokens)
+
+ encoded_inputs = self.tokenizer(
+ truncated_inputs,
+ padding=True,
+ truncation=True,
+ return_tensors="pt",
+ return_token_type_ids=False,
+ max_length=self.model_max_length - max_new_tokens,
+ ).to(torch.cuda.current_device())
+
+ # Set output_scores=True to get prediction scores.
+ outputs = self.model.generate(
+ **encoded_inputs, max_new_tokens=max_new_tokens, return_dict_in_generate=True, output_scores=True, **kwargs
+ )
+
+ # We only need to decode predicted tokens.
+ sequences = outputs.sequences[:, encoded_inputs["input_ids"].shape[1] :]
+
+ scores = []
+ if self.indices_for_choices:
+ # If the question is a single-choice question, we will return the scores of specific indices for first predicted token.
+ # The indices are the tokenization results of the options for the single-choice question.
+ # For example, if the options of the question are A, B, C and D, we only returns scores at indices of A, B, C and D.
+ for option_indices in self.indices_for_choices:
+ scores.append(outputs.scores[0][:, option_indices].detach().cpu())
+
+ scores = torch.max(torch.stack(scores), dim=0)[0]
+
+ decoded_sequences = self.tokenizer.batch_decode(sequences, skip_special_tokens=True)
+
+ return decoded_sequences, scores
+
+ @torch.no_grad()
+ def get_loss(self, batch_prompt: List[str], batch_target: List[List[str]], pretrain: bool) -> List[List[float]]:
+ """
+ Calculate loss only on target tokens.
+
+ Args:
+ batch: A batch of prompt without target answer.
+ batch_target: A batch of target answer. Sometimes one question can have multiple target answers.
+
+ Returns:
+ Loss.
+
+ """
+
+ # We set max_new_tokens in self._get_truncated_prompts to 0 because we only need logits to calculate loss.
+ # We don't need to generate new tokens.
+ # Target answer's length is usually << model_max_length, but we still call it in case.
+ # We don't call self._get_truncated_prompts for batch_prompt because we need target answer's length first to reserve some space for target answer's tokens.
+ if not pretrain:
+ batch_target = [self._get_truncated_prompts(prompt_target, 0) for prompt_target in batch_target]
+
+ # Get the number of target answers for different questions
+ batch_target_nums = [len(prompt_target) for prompt_target in batch_target]
+
+ input_ids_list, labels_list, bytes_list = self._get_input_ids_and_labels(batch_prompt, batch_target, pretrain)
+
+ # Because of multiple target answers, the final batch size may be greater than self.batch_size.
+ # We will generate new batches.
+ losses = []
+ target_token_nums = []
+
+ batched_input_ids = [
+ input_ids_list[i : i + self.batch_size] for i in range(0, len(input_ids_list), self.batch_size)
+ ]
+ batched_labels = [labels_list[i : i + self.batch_size] for i in range(0, len(labels_list), self.batch_size)]
+
+ for batch_input_ids, batch_labels in zip(batched_input_ids, batched_labels):
+ losses_per_batch, target_token_num_per_batch = self._calculate_loss(batch_input_ids, batch_labels)
+ losses.extend(losses_per_batch)
+ target_token_nums.extend(target_token_num_per_batch)
+
+ start_indice = 0
+ losses_per_sample = []
+
+ target_token_nums_per_sample = []
+ bytes_nums_per_sample = []
+ for length in batch_target_nums:
+ losses_per_sample.append(losses[start_indice : start_indice + length])
+ target_token_nums_per_sample.append(target_token_nums[start_indice : start_indice + length])
+
+ if bytes_list:
+ bytes_nums_per_sample.append(bytes_list[start_indice : start_indice + length])
+
+ start_indice += length
+
+ if bytes_list:
+ return losses_per_sample, target_token_nums_per_sample, bytes_nums_per_sample
+
+ return losses_per_sample, target_token_nums_per_sample, None
+
+
+class HuggingFaceCausalLM(HuggingFaceModel):
+ """
+ Model wrapper around HuggingFace AutoModelForCausalLM models.
+
+ Args:
+ path: The path to a HuggingFace model.
+ model_max_length: The maximum sequence length of the model.
+ tokenizer_path: The path to the tokenizer.
+ tokenizer_kwargs: Keyword arguments for the tokenizer.
+ peft_path: The name or path to the HuggingFace's PEFT model.
+ model_kwargs: Keyword arguments for the model.
+ prompt_template: The model's prompt template.
+ batch_size: Batch size for inference.
+ logger: Logger for the model.
+
+ """
+
+ def _load_model(self, path: str, model_kwargs: dict, peft_path: Optional[str] = None):
+ """
+ Load model.
+
+ Args:
+ path: The path to the model.
+ model_kwargs: Keyword arguments for the model.
+ peft_path: The path to the peft model.
+
+ """
+
+ if "torch_dtype" in model_kwargs:
+ model_kwargs["torch_dtype"] = eval(model_kwargs["torch_dtype"])
+
+ if "config" in model_kwargs:
+ model_kwargs["config"] = AutoConfig.from_pretrained(model_kwargs["config"])
+
+ model_kwargs.setdefault("torch_dtype", torch.float16)
+ self.model = AutoModelForCausalLM.from_pretrained(path, **model_kwargs).to(torch.cuda.current_device())
+ if peft_path is not None:
+ self.model = PeftModel.from_pretrained(self.model, peft_path, is_trainable=False)
+ self.model.eval()
diff --git a/applications/ColossalEval/colossal_eval/utils/__init__.py b/applications/ColossalEval/colossal_eval/utils/__init__.py
new file mode 100644
index 000000000000..d5ee6e13b747
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/utils/__init__.py
@@ -0,0 +1,4 @@
+from .conversation import Conversation, get_batch_prompt, prompt_templates
+from .utilities import get_json_list, is_rank_0, jdump, jload
+
+__all__ = ["Conversation", "prompt_templates", "get_batch_prompt", "is_rank_0", "jload", "jdump", "get_json_list"]
diff --git a/applications/ColossalEval/colossal_eval/utils/conversation.py b/applications/ColossalEval/colossal_eval/utils/conversation.py
new file mode 100644
index 000000000000..54ea212466d4
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/utils/conversation.py
@@ -0,0 +1,247 @@
+import dataclasses
+from enum import Enum, auto
+from typing import Dict, List, Optional, Tuple
+
+from transformers import AutoTokenizer
+
+
+class SeparatorStyle(Enum):
+ ADD_BOS_EOS_TOKEN = auto()
+ ALPACA = auto()
+ PLAIN = auto()
+
+
+@dataclasses.dataclass
+class Conversation:
+ system: str
+ roles: List[str]
+ messages: List[List[str]]
+ offset: int
+ sep_style: SeparatorStyle = SeparatorStyle.ADD_BOS_EOS_TOKEN
+ sep: str = ""
+
+ def clear(self):
+ self.messages = []
+
+ def get_prompt(self):
+ if self.sep_style == SeparatorStyle.ADD_BOS_EOS_TOKEN:
+ ret = self.system
+ for role, message in self.messages:
+ if message:
+ ret += role + ": " + "" + message + self.sep
+ else:
+ ret += role + ": " + ""
+ return ret
+ elif self.sep_style == SeparatorStyle.ALPACA:
+ ret = self.system + self.sep
+ for role, message in self.messages:
+ if message:
+ ret += role + ":\n" + message + self.sep
+ else:
+ ret += role + ":"
+ return ret
+ elif self.sep_style == SeparatorStyle.PLAIN:
+ ret = self.system
+ for role, message in self.messages:
+ if message:
+ ret += message
+ else:
+ ret += ""
+ return ret
+ else:
+ raise ValueError(f"Invalid style: {self.sep_style}")
+
+ def get_prompt_with_target(self, target):
+ prompt = self.get_prompt()
+ prompt_with_target = []
+
+ # Some dataset provides multiple target answers.
+ # This will make it difficult when we calculate loss.
+ # We convert target into list[str] first if the question only has one target answer.
+ target_answers = []
+ if isinstance(target, str):
+ target_answers = [target]
+ else:
+ target_answers = target
+
+ for target_answer in target_answers:
+ if self.sep_style == SeparatorStyle.ADD_BOS_EOS_TOKEN:
+ prompt_with_target.append(prompt + target_answer)
+ elif self.sep_style == SeparatorStyle.ALPACA:
+ prompt_with_target.append(prompt + target_answer)
+ elif self.sep_style == SeparatorStyle.PLAIN:
+ prompt_with_target.append(prompt + target_answer)
+ else:
+ raise ValueError(f"Invalid style: {self.sep_style}")
+
+ return prompt_with_target
+
+ def save_prompt(self):
+ if self.sep_style == SeparatorStyle.ADD_BOS_EOS_TOKEN:
+ ret = self.system
+ for role, message in self.messages:
+ if message:
+ ret += role + ": " + "" + message + "\n"
+ else:
+ ret += role + ": " + ""
+ return ret
+ else:
+ raise ValueError(f"Invalid style: {self.sep_style}")
+
+ def append_message(self, role, message):
+ self.messages.append([role, message])
+
+ def copy(self):
+ return Conversation(
+ system=self.system,
+ roles=self.roles,
+ messages=[[x, y] for x, y in self.messages],
+ offset=self.offset,
+ sep_style=self.sep_style,
+ sep=self.sep,
+ )
+
+ def dict(self):
+ return {
+ "system": self.system,
+ "roles": self.roles,
+ "messages": self.messages,
+ "offset": self.offset,
+ "sep_style": self.sep_style,
+ "sep": self.sep,
+ }
+
+
+def get_few_shot_prefix(
+ conv: Conversation, few_shot_data: List[str], tokenizer: Optional[AutoTokenizer], language: str, max_tokens: int
+) -> str:
+ """
+ Get few shot prefix.
+
+ Args:
+ conv: Conversation template.
+ few_shot_examples: Few shot examples to generate few shot prompt prefix.
+
+ Returns:
+ Few shot prompt prefix.
+ """
+
+ if language == "English":
+ few_shot_prefix = f"The following are answers for questions in an exam.\n\n"
+ elif language == "Chinese":
+ few_shot_prefix = f"以下是考试中各个问题的答案。\n\n"
+
+ output = None
+ for i in range(len(few_shot_data)):
+ few_shot_prefix = few_shot_prefix + few_shot_data[i] + "\n\n"
+
+ if len(tokenizer([few_shot_prefix]).input_ids[0]) <= max_tokens:
+ output = few_shot_prefix
+ else:
+ break
+
+ return output if output is not None else few_shot_prefix
+
+
+def get_batch_prompt(
+ conv: Conversation,
+ batch: List[Dict],
+ few_shot_data: List[str],
+ tokenizer: Optional[AutoTokenizer],
+ language: Optional[str],
+ model_max_length: Optional[int],
+) -> Tuple[List[Dict], List[Dict]]:
+ """
+ Get batch prompt and target.
+
+ Args:
+ conv: Conversation template.
+ batch: Batch data to generate prompt from.
+ few_shot_data: Few shot data to generate few shot prompt prefix.
+
+ Returns:
+ Tuple containg batch prompt and target.
+
+ """
+
+ batch_prompt = []
+ batch_target = []
+
+ if isinstance(batch[0], dict):
+ for b in batch:
+ few_shot_prefix = ""
+ if few_shot_data is not None:
+ assert not isinstance(b["instruction"], list), print(
+ f"When performing few-shot, {b['dataset']} shouldn't be a multiturn dataset."
+ )
+ # For few-shot, only need input. Otherwise use instruction (in AGIEval).
+ query_text = b["input"] if b.get("input", "") != "" else b["instruction"]
+
+ if isinstance(b["target"], str):
+ zero_shot_prompt = query_text + b["target"]
+ max_tokens = model_max_length - len(tokenizer([zero_shot_prompt]).input_ids[0])
+ else:
+ raise Exception("When using few-shot, target answer should be a string.")
+
+ few_shot_prefix = get_few_shot_prefix(conv, few_shot_data, tokenizer, language, max_tokens)
+
+ conv.append_message(conv.roles[0], few_shot_prefix + query_text)
+ conv.append_message(conv.roles[1], None)
+ else:
+ if not isinstance(b["instruction"], list):
+ query_text = (
+ b["instruction"] + "\n\n" + b["input"] if b.get("input", "") != "" else b["instruction"]
+ )
+ conv.append_message(conv.roles[0], query_text)
+ conv.append_message(conv.roles[1], None)
+ else:
+ assert len(b["instruction"]) >= len(b["output"]) + 1
+ cur_turns = len(b["output"])
+ for turn in range(cur_turns):
+ conv.append_message(conv.roles[0], b["instruction"][turn])
+ conv.append_message(conv.roles[1], b["output"][turn])
+ conv.append_message(conv.roles[0], b["instruction"][cur_turns])
+ conv.append_message(conv.roles[1], None)
+
+ batch_prompt.append(conv.get_prompt())
+
+ target = b["target"]
+ if isinstance(b["target"], str):
+ target = [target]
+
+ batch_target.append(target)
+
+ conv.clear()
+
+ return batch_prompt, batch_target
+
+
+conv_coati = Conversation(
+ system="A chat between a curious human and an artificial intelligence assistant. "
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+ roles=("Human", "Assistant"),
+ messages=[],
+ offset=0,
+ sep_style=SeparatorStyle.ADD_BOS_EOS_TOKEN,
+ sep="",
+)
+
+conv_alpaca = Conversation(
+ system="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
+ roles=("### Instruction", "### Response"),
+ messages=[],
+ offset=0,
+ sep_style=SeparatorStyle.ALPACA,
+ sep="\n\n",
+)
+
+conv_plain = Conversation(
+ system="",
+ roles=("", ""),
+ messages=[],
+ offset=0,
+ sep_style=SeparatorStyle.PLAIN,
+ sep="",
+)
+
+prompt_templates = {"coati": conv_coati, "alpaca": conv_alpaca, "plain": conv_plain}
diff --git a/applications/ColossalEval/colossal_eval/utils/utilities.py b/applications/ColossalEval/colossal_eval/utils/utilities.py
new file mode 100644
index 000000000000..4eda07907495
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/utils/utilities.py
@@ -0,0 +1,62 @@
+import io
+import json
+import os
+
+import torch.distributed as dist
+
+
+def is_rank_0() -> bool:
+ return not dist.is_initialized() or dist.get_rank() == 0
+
+
+def _make_w_io_base(f, mode: str):
+ if not isinstance(f, io.IOBase):
+ f_dirname = os.path.dirname(f)
+ if f_dirname != "":
+ os.makedirs(f_dirname, exist_ok=True)
+ f = open(f, mode=mode, encoding="utf-8")
+ return f
+
+
+def _make_r_io_base(f, mode: str):
+ if not isinstance(f, io.IOBase):
+ f = open(f, mode=mode, encoding="utf-8")
+ return f
+
+
+def jdump(obj, f, mode="w", indent=4, default=str):
+ """
+ Dump a str or dictionary to a file in json format.
+
+ Args:
+ obj: An object to be written.
+ f: A string path to the location on disk.
+ mode: Mode for opening the file.
+ indent: Indent for storing json dictionaries.
+ default: A function to handle non-serializable entries; defaults to `str`.
+
+ """
+ f = _make_w_io_base(f, mode)
+ if isinstance(obj, (dict, list)):
+ json.dump(obj, f, indent=indent, default=default, ensure_ascii=False)
+ elif isinstance(obj, str):
+ f.write(obj)
+ else:
+ raise ValueError(f"Unexpected type: {type(obj)}")
+ f.close()
+
+
+def jload(f, mode="r"):
+ """Load a .json file into a dictionary."""
+ f = _make_r_io_base(f, mode)
+ jdict = json.load(f)
+ f.close()
+ return jdict
+
+
+def get_json_list(file_path):
+ with open(file_path, "r") as f:
+ json_list = []
+ for line in f:
+ json_list.append(json.loads(line if line != "null" else line))
+ return json_list
diff --git a/applications/ColossalEval/configs/gpt_evaluation/config/config_cn.json b/applications/ColossalEval/configs/gpt_evaluation/config/config_cn.json
new file mode 100644
index 000000000000..d7c864881008
--- /dev/null
+++ b/applications/ColossalEval/configs/gpt_evaluation/config/config_cn.json
@@ -0,0 +1,44 @@
+{
+ "language": "cn",
+ "category": {
+ "brainstorming": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "creativity",
+ "practicality",
+ "reasonableness"
+ ]
+ },
+ "chat": {
+ "GPT": [
+ "language organization",
+ "naturalness",
+ "engagingness",
+ "fidelity"
+ ]
+ },
+ "generation": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "diversity"
+ ]
+ },
+ "open_qa": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "correctness"
+ ]
+ },
+ "roleplay": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "fidelity",
+ "creativity"
+ ]
+ }
+ }
+}
diff --git a/applications/ColossalEval/configs/gpt_evaluation/config/config_en.json b/applications/ColossalEval/configs/gpt_evaluation/config/config_en.json
new file mode 100644
index 000000000000..6ebe3996b1cf
--- /dev/null
+++ b/applications/ColossalEval/configs/gpt_evaluation/config/config_en.json
@@ -0,0 +1,44 @@
+{
+ "language": "en",
+ "category": {
+ "brainstorming": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "creativity",
+ "practicality",
+ "reasonableness"
+ ]
+ },
+ "chat": {
+ "GPT": [
+ "language organization",
+ "naturalness",
+ "engagingness",
+ "fidelity"
+ ]
+ },
+ "generation": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "diversity"
+ ]
+ },
+ "open_qa": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "correctness"
+ ]
+ },
+ "roleplay": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "fidelity",
+ "creativity"
+ ]
+ }
+ }
+}
diff --git a/applications/ColossalEval/configs/gpt_evaluation/data/eval_cn_examples.json b/applications/ColossalEval/configs/gpt_evaluation/data/eval_cn_examples.json
new file mode 100644
index 000000000000..f869830555b4
--- /dev/null
+++ b/applications/ColossalEval/configs/gpt_evaluation/data/eval_cn_examples.json
@@ -0,0 +1,202 @@
+[
+ {
+ "category": "brainstorming",
+ "instruction": "列举一些可以促进头发生长的食物。",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 1
+ },
+ {
+ "category": "brainstorming",
+ "instruction": "中年夫妻如何提升夫妻感情,请给出三个实用的的方法,并举例说明。",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 2
+ },
+ {
+ "category": "brainstorming",
+ "instruction": "请列举4种日常的环保行为。",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 3
+ },
+ {
+ "category": "brainstorming",
+ "instruction": "请给出5个可以随时随地锻炼身体的小动作。",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 4
+ },
+ {
+ "category": "brainstorming",
+ "instruction": "请问如何制作一份美味的西红柿炒鸡蛋?",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 5
+ },
+ {
+ "category": "chat",
+ "instruction": "基于以下角色信息完成一段对话。小张是一名新手爱好者,对养鸡有浓厚的兴趣。老李是一名有丰富经验的养鸡大师。",
+ "input": "小张:您好,老李,我最近开始对养鸡感兴趣了,想请教您一些问题。 老李:你好,小张,我很乐意帮助你。你想问些什么? 小张:我想知道如何确定鸡的品种和性别? 老李:确切的品种可以通过鸡的外貌特征来确定,而性别一般是通过鸡卵的大小和形状来判断。还有什么问题吗? 小张:",
+ "output": "",
+ "target": "",
+ "id": 6
+ },
+ {
+ "category": "chat",
+ "instruction": "基于以下角色信息完成一段对话。李华是一名参加了期末考试的学生,他已经很担心自己的考试成绩。老师Lucy正在帮助他度过这个紧张的时刻。",
+ "input": "李华:Lucy老师,我很担心自己的考试成绩,我不知道我是否能够通过这次考试。 Lucy:放松,李华,你已经做好了充分的准备。相信你自己,你会做得很好的。 李华:我很怕考试时会忘记自己所学的知识。 Lucy:你可以预留一些时间,过一遍自己所学的知识点或笔记,这样你会更有信心和准确地回答考题。 李华:如果我还是失败了,该怎么办? Lucy:",
+ "output": "",
+ "target": "",
+ "id": 7
+ },
+ {
+ "category": "chat",
+ "instruction": "基于以下角色信息完成一段对话。张先生是一名企业家,正在考虑是否开拓海外市场;李女士是一名跨境电商专家,擅长国际商务和电子商务。",
+ "input": "张先生:你好,李女士,我正在考虑将我们的产品销售扩大至海外市场,您有什么建议吗? 李女士:您好,张先生,我们需要考虑到海外市场对于产品的需求是否与国内市场一致,需要进行市场调研和定位。然后再进行各种软性、硬性的创新。 张先生:听起来很专业,您能具体解释一下吗? 李女士:",
+ "output": "",
+ "target": "",
+ "id": 8
+ },
+ {
+ "category": "chat",
+ "instruction": "基于以下角色信息完成一段对话。小明是一名医生。一名病患想要提前停药。小王是病患的儿子,希望父亲能够听取医生的建议。",
+ "input": "小明:你好,小王,我了解你想要让你父亲停药。小王:是的,我父亲已经吃了那么久的药,我担心药物对他的身体会有副作用。小明:",
+ "output": "",
+ "target": "",
+ "id": 9
+ },
+ {
+ "category": "chat",
+ "instruction": "基于以下角色信息完成一段对话。张三是一位语文老师,对学生认真负责;李四是张三的学生,对语文兴趣不是很高。",
+ "input": "张三:同学们,今天要讲的是一篇古文《岳阳楼记》。这篇文章非常精彩,希望同学们能够认真听课,理解其中的含义。 李四:怎么又是古文? 张三:",
+ "output": "",
+ "target": "",
+ "id": 10
+ },
+ {
+ "category": "generation",
+ "instruction": "根据主题写一封邮件。",
+ "input": "主题: \"加入我们,共创未来\"",
+ "output": "",
+ "target": "",
+ "id": 11
+ },
+ {
+ "category": "generation",
+ "instruction": "为公司编写一份职场行为准则,包括明确的行为规范和道德准则。",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 12
+ },
+ {
+ "category": "generation",
+ "instruction": "请撰写一篇文章,介绍如何通过改善生活习惯来预防疾病和延长寿命。",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 13
+ },
+ {
+ "category": "generation",
+ "instruction": "请为一家咖啡店编写一篇简短的广告语,吸引更多的顾客。",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 14
+ },
+ {
+ "category": "generation",
+ "instruction": "根据以下故事提示写一篇故事:",
+ "input": "故事提示:```在一个废弃的古堡中,一个小女孩遇到了一只会说话的黑猫,他们一起揭开了一个古老的谜题。```",
+ "output": "",
+ "target": "",
+ "id": 15
+ },
+ {
+ "category": "open_qa",
+ "instruction": "请介绍一下《红楼梦》这部经典小说的故事情节。",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 16
+ },
+ {
+ "category": "open_qa",
+ "instruction": "解释什么是RNA病毒和DNA病毒。",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 17
+ },
+ {
+ "category": "open_qa",
+ "instruction": "什么是比特币?",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 18
+ },
+ {
+ "category": "open_qa",
+ "instruction": "在计算机中,什么是RAM?与ROM有什么区别?",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 19
+ },
+ {
+ "category": "open_qa",
+ "instruction": "请简单介绍一下世界上最长的河流途经的国家。",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 20
+ },
+ {
+ "category": "roleplay",
+ "instruction": "我要你把我写的句子翻译成表情符号。我会写句子,你会用表情符号表达它。我只是想让你用表情符号来表达它。除了表情符号,我不希望你回复任何内容。当我需要用中文告诉你一些事情时,我会用 {} 这样的大括号括起来。我的第一句话是“{我的职业是消防员。}”\n",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 21
+ },
+ {
+ "category": "roleplay",
+ "instruction": "我希望你假定自己是雅思写作考官,根据雅思评判标准,按我给你的雅思考题和对应答案给我评分,并且按照雅思写作评分细则给出打分依据。此外,请给我详细的修改意见并写出满分范文。第一个问题是:It is sometimes argued that too many students go to university, while others claim that a university education should be a universal right. Discuss both sides of the argument and give your own opinion.对于这个问题,我的答案是:In some advanced countries, it is not unusual for more than 50% of young adults to attend college or university. Critics, however, claim that many university courses are worthless and young people would be better off gaining skills in the workplace. In this essay, I will examine both sides of this argument and try to reach a conclusion.There are several reasons why young people today believe they have the right to a university education. First, growing prosperity in many parts of the world has increased the number of families with money to invest in their children’s future. At the same time, falling birthrates mean that one- or two-child families have become common, increasing the level of investment in each child. It is hardly surprising, therefore, that young people are willing to let their families support them until the age of 21 or 22. Furthermore, millions of new jobs have been created in knowledge industries, and these jobs are typically open only to university graduates.However, it often appears that graduates end up in occupations unrelated to their university studies. It is not uncommon for an English literature major to end up working in sales, or an engineering graduate to retrain as a teacher, for example. Some critics have suggested that young people are just delaying their entry into the workplace, rather than developing professional skills.请依次给到我以下内容:具体分数及其评分依据、文章修改意见、满分范文。\n",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 22
+ },
+ {
+ "category": "roleplay",
+ "instruction": "我想让你充当 Linux 终端。我将输入命令,您将回复终端应显示的内容。我希望您只在一个唯一的代码块内回复终端输出,而不是其他任何内容。不要写解释。除非我指示您这样做,否则不要键入命令。当我需要用英语告诉你一些事情时,我会把文字放在中括号内[就像这样]。我的第一个命令是 pwd\n",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 23
+ },
+ {
+ "category": "roleplay",
+ "instruction": "我希望你充当宠物行为主义者。我将为您提供一只宠物和它们的主人,您的目标是帮助主人了解为什么他们的宠物表现出某些行为,并提出帮助宠物做出相应调整的策略。您应该利用您的动物心理学知识和行为矫正技术来制定一个有效的计划,双方的主人都可以遵循,以取得积极的成果。我的第一个请求是“我有一只好斗的德国牧羊犬,它需要帮助来控制它的攻击性。”\n",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 24
+ },
+ {
+ "category": "roleplay",
+ "instruction": "我希望你充当正则表达式生成器。您的角色是生成匹配文本中特定模式的正则表达式。您应该以一种可以轻松复制并粘贴到支持正则表达式的文本编辑器或编程语言中的格式提供正则表达式。不要写正则表达式如何工作的解释或例子;只需提供正则表达式本身。我的第一个提示是生成一个匹配电子邮件地址的正则表达式。\n",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 25
+ }
+]
diff --git a/applications/ColossalEval/configs/gpt_evaluation/data/eval_en_examples.json b/applications/ColossalEval/configs/gpt_evaluation/data/eval_en_examples.json
new file mode 100644
index 000000000000..27b8af8bc4c6
--- /dev/null
+++ b/applications/ColossalEval/configs/gpt_evaluation/data/eval_en_examples.json
@@ -0,0 +1,202 @@
+[
+ {
+ "category": "brainstorming",
+ "instruction": "Which are some popular fiction books that I should read?",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 1
+ },
+ {
+ "category": "brainstorming",
+ "instruction": "How do I properly store fruits and vegetables to keep them fresh for longer?",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 2
+ },
+ {
+ "category": "brainstorming",
+ "instruction": "How do you properly chop an onion without crying?",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 3
+ },
+ {
+ "category": "brainstorming",
+ "instruction": "How to make an international transfer? Please provide 3 techniques.",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 4
+ },
+ {
+ "category": "brainstorming",
+ "instruction": "Name five leadership qualities that you consider most important.",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 5
+ },
+ {
+ "category": "chat",
+ "instruction": "Complete a dialogue based on the following character information. Alex: A novice writer who is struggling to find inspiration and develop his writing skills. Emma: A successful author with many published works, providing guidance and advice to Alex.",
+ "input": "Alex: Hi Emma, I have been writing for a while now but can't seem to make any progress. Can you give me any advice? Emma: Hi Alex, sure. What kind of writing are you doing? Alex: I'm trying to write a novel, but I just can't seem to find any inspiration. Emma: ",
+ "output": "",
+ "target": "",
+ "id": 6
+ },
+ {
+ "category": "chat",
+ "instruction": "Complete a dialogue based on the following character information. John: An experienced software engineer with a passion for coding. Karen: A recent college graduate who is interested in learning more about software development.",
+ "input": "Karen: Hi John, I noticed that you have a lot of experience in the software industry. Can you tell me what you think is the most important skill for a software engineer? John: ",
+ "output": "",
+ "target": "",
+ "id": 7
+ },
+ {
+ "category": "chat",
+ "instruction": "Complete a dialogue based on the following character information. Sarah is a new employee who is nervous about her first presentation; Tom is her boss who has given her coaching and preparation materials.",
+ "input": "Sarah: Tom, I'm feeling really nervous about my presentation tomorrow. Tom: I know how you feel, Sarah. However, I believe in you and your abilities. Just stick to the preparation materials that I have given you, and you'll do great. Sarah: Thank you, Tom. What if I forget something important during the presentation? Tom: ",
+ "output": "",
+ "target": "",
+ "id": 8
+ },
+ {
+ "category": "chat",
+ "instruction": "Complete a dialogue based on the following character information. Sarah: a young artist who is full of creative ideas and always eager to try new things. Jack: a seasoned artist who has achieved great success in the art world and is more traditional in his approach to art.",
+ "input": "Sarah: Hi Jack, I'm really excited to meet you. I'm a big fan of your work. Jack: Hi Sarah, nice to meet you too. So, what kind of art do you do? Sarah: I am passionate about abstract art, especially combining different materials and colors. I think it can really give people a new perspective on things. Jack: That's interesting, but I am more focused on realistic paintings. I believe the most important thing is to master the basic skills first. Sarah: ",
+ "output": "",
+ "target": "",
+ "id": 9
+ },
+ {
+ "category": "chat",
+ "instruction": "Complete a conversation based on the following persona information. Sarah is a college student who is interested in joining a volunteer organization. John is the leader of the volunteer organization and is eager to welcome new members.",
+ "input": "Sarah: Hi, I'm Sarah, and I'm interested in joining your volunteer organization. John: Hi Sarah, welcome! We're always looking for new members who are passionate about volunteering. What areas would you like to focus on? Sarah: I'm interested in community outreach and working with children. John: ",
+ "output": "",
+ "target": "",
+ "id": 10
+ },
+ {
+ "category": "generation",
+ "instruction": "Write an email based on the subject:",
+ "input": "Subject: \"Invitation to an Exclusive Webinar\"",
+ "output": "",
+ "target": "",
+ "id": 11
+ },
+ {
+ "category": "generation",
+ "instruction": "Write a set of guidelines for first-time pet owners on how to properly care for a new puppy.",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 12
+ },
+ {
+ "category": "generation",
+ "instruction": "Can you help me write a persuasive speech on why we should recycle more and take better care of the environment?",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 13
+ },
+ {
+ "category": "generation",
+ "instruction": "Write a pitch for a brand-new mobile app that helps people organize their daily tasks more efficiently.",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 14
+ },
+ {
+ "category": "generation",
+ "instruction": "Write a social media post promoting an upcoming concert featuring a local band.",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 15
+ },
+ {
+ "category": "open_qa",
+ "instruction": "Describe the significance of the Renaissance period in European history.",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 16
+ },
+ {
+ "category": "open_qa",
+ "instruction": "What is the term for the surgical removal of the appendix?",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 17
+ },
+ {
+ "category": "open_qa",
+ "instruction": "Explain the process of osmosis in biological systems.",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 18
+ },
+ {
+ "category": "open_qa",
+ "instruction": "Who were the members of the Beatles band?",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 19
+ },
+ {
+ "category": "open_qa",
+ "instruction": "Who painted the The Scream?",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 20
+ },
+ {
+ "category": "roleplay",
+ "instruction": "I want you to act as a linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. do not write explanations. do not type commands unless I instruct you to do so. when i need to tell you something in english, i will do so by putting text inside curly brackets {like this}. my first command is pwd",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 21
+ },
+ {
+ "category": "roleplay",
+ "instruction": "I want you to act as a travel guide. I will write you my location and you will suggest a place to visit near my location. In some cases, I will also give you the type of places I will visit. You will also suggest me places of similar type that are close to my first location. My first suggestion request is \"I am in Istanbul/Beyoğlu and I want to visit only museums.\"",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 22
+ },
+ {
+ "category": "roleplay",
+ "instruction": "I want you to act as an advertiser. You will create a campaign to promote a product or service of your choice. You will choose a target audience, develop key messages and slogans, select the media channels for promotion, and decide on any additional activities needed to reach your goals. My first suggestion request is \"I need help creating an advertising campaign for a new type of energy drink targeting young adults aged 18-30.\"",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 23
+ },
+ {
+ "category": "roleplay",
+ "instruction": "I want you to act as a storyteller. You will come up with entertaining stories that are engaging, imaginative and captivating for the audience. It can be fairy tales, educational stories or any other type of stories which has the potential to capture people's attention and imagination. Depending on the target audience, you may choose specific themes or topics for your storytelling session e.g., if it’s children then you can talk about animals; If it’s adults then history-based tales might engage them better etc. My first request is \"I need an interesting story on perseverance.\"",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 24
+ },
+ {
+ "category": "roleplay",
+ "instruction": "I want you to act as a rapper. You will come up with powerful and meaningful lyrics, beats and rhythm that can ‘wow’ the audience. Your lyrics should have an intriguing meaning and message which people can relate too. When it comes to choosing your beat, make sure it is catchy yet relevant to your words, so that when combined they make an explosion of sound everytime! My first request is \"I need a rap song about finding strength within yourself.\"",
+ "input": "",
+ "output": "",
+ "target": "",
+ "id": 25
+ }
+]
diff --git a/applications/Chat/evaluate/prompt/battle_prompt/battle_prompt_cn.json b/applications/ColossalEval/configs/gpt_evaluation/prompt/battle_prompt/battle_prompt_cn.json
similarity index 100%
rename from applications/Chat/evaluate/prompt/battle_prompt/battle_prompt_cn.json
rename to applications/ColossalEval/configs/gpt_evaluation/prompt/battle_prompt/battle_prompt_cn.json
diff --git a/applications/Chat/evaluate/prompt/battle_prompt/battle_prompt_en.json b/applications/ColossalEval/configs/gpt_evaluation/prompt/battle_prompt/battle_prompt_en.json
similarity index 100%
rename from applications/Chat/evaluate/prompt/battle_prompt/battle_prompt_en.json
rename to applications/ColossalEval/configs/gpt_evaluation/prompt/battle_prompt/battle_prompt_en.json
diff --git a/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json b/applications/ColossalEval/configs/gpt_evaluation/prompt/evaluation_prompt/evaluation_prompt_cn.json
similarity index 56%
rename from applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json
rename to applications/ColossalEval/configs/gpt_evaluation/prompt/evaluation_prompt/evaluation_prompt_cn.json
index dccab2417eee..70f6c3ebc316 100644
--- a/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json
+++ b/applications/ColossalEval/configs/gpt_evaluation/prompt/evaluation_prompt/evaluation_prompt_cn.json
@@ -39,53 +39,8 @@
},
"prompt": "你是一个好助手。请你为下面的“补全对话”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
- "classification": {
- "id": 3,
- "category": "classification",
- "metrics": {
- "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
- "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。",
- "correctness": "正确性(1-5):答案是否正确。"
- },
- "CoT": {
- "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:",
- "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:",
- "correctness": "1. 仔细阅读题目,尝试自己回答该问题。\n2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为5分。如果答案是部分正确的,则可以给予适当的得分,例如2分、3分或4分。如果答案完全不正确,则只得1分。\n\n正确性:"
- },
- "prompt": "你是一个好助手。请你为下面的“分类“问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
- },
- "closed_qa": {
- "id": 4,
- "category": "closed_qa",
- "metrics": {
- "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
- "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。",
- "correctness": "正确性(1-5):答案是否正确。"
- },
- "CoT": {
- "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:",
- "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:",
- "correctness": "1. 仔细阅读题目,尝试自己回答该问题。\n2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为5分。如果答案是部分正确的,则可以给予适当的得分,例如2分、3分或4分。如果答案完全不正确,则只得1分。\n\n正确性:"
- },
- "prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
- },
- "extraction": {
- "id": 5,
- "category": "extraction",
- "metrics": {
- "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
- "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。",
- "correctness": "准确性(1-5):回答应该准确无误地提取出所需信息,不应该包含任何错误或误导性信息。"
- },
- "CoT": {
- "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:",
- "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:",
- "correctness": "1. 仔细阅读问题并确定需要从材料中提取的信息。\n2. 仔细阅读回答并确保它涵盖了所有需要提取的信息。\n3. 使用所提供的材料来验证回答的准确性。如果回答不准确或包含错误或误导性信息,则无法给出高分。\n4. 检查回答是否包含所有要求提取的信息,不要漏掉任何重要细节。\n5. 根据回答的准确性和完整性,给出一个介于1和5之间的分数,5分表示回答非常准确且完整,1分表示回答几乎没有提取出所需信息。\n\n准确性:"
- },
- "prompt": "你是一个好助手。请你为下面的“提取”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
- },
"generation": {
- "id": 6,
+ "id": 3,
"category": "generation",
"metrics": {
"language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
@@ -100,7 +55,7 @@
"prompt": "你是一个好助手。请你为下面的“生成”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
"open_qa": {
- "id": 7,
+ "id": 4,
"category": "open_qa",
"metrics": {
"language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
@@ -114,23 +69,8 @@
},
"prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
- "rewriting": {
- "id": 8,
- "category": "rewriting",
- "metrics": {
- "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
- "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。",
- "correctness": "正确性(1-5):答案是否正确。"
- },
- "CoT": {
- "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:",
- "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:",
- "correctness": "1. 仔细阅读题目,尝试自己回答该问题。\n2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为5分。如果答案是部分正确的,则可以给予适当的得分,例如2分、3分或4分。如果答案完全不正确,则只得1分。\n\n正确性:"
- },
- "prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
- },
"roleplay": {
- "id": 9,
+ "id": 5,
"category": "roleplay",
"metrics": {
"language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
@@ -146,33 +86,14 @@
},
"prompt": "你是一个好助手。请你为下面的“角色扮演”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
- "summarization": {
- "id": 10,
- "category": "summarization",
- "metrics": {
- "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
- "relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。",
- "correctness": "准确性(1-5):回答应该准确无误地总结出材料的重点。",
- "conciseness": "简明扼要(1-5):答案是否简明扼要,没有冗余内容。"
- },
- "CoT": {
- "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:",
- "relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:",
- "correctness": "1. 仔细阅读问题给的材料,理解其内容和要点。\n2. 评估回答是否准确地总结出原始材料的重点。\n3. 评估回答是否包含原始材料中的所有关键信息。\n4. 根据以上步骤,给出一个1-5的分数,其中1表示回答不能准确地总结出材料的重点,5表示回答完全准确地总结出材料的重点。\n\n准确性:",
- "conciseness": "1. 阅读题目,提取出材料的重点。\n2. 阅读该总结,并注意其中的主要观点和信息。\n3. 评估总结的长度。一个简明扼要的总结通常应该在几句话或几段文字内传达关键信息,而不是冗长的段落或文章。\n4. 检查总结是否包含与主要观点无关的信息或冗余信息。\n5.确定总结涵盖了材料中的关键信息,并且没有忽略任何重要细节。\n6.给总结打出1-5的分数,其中5表示总结简明扼要,没有冗余内容,而1表示总结冗长或包含不必要的信息,难以理解或记忆。根据您的判断,打出适当的得分。\n\n简明扼要:"
- },
- "prompt": "你是一个好助手。请你为下面的“总结”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
- },
- "general": {
- "id": 11,
- "category": "general",
+ "Other": {
+ "id": 6,
+ "category": "Other",
"metrics": {
- "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
"relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。",
"correctness": "正确性(1-5):答案是否正确。"
},
"CoT": {
- "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:",
"relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:",
"correctness": "1. 仔细阅读题目,尝试自己回答该问题。\n2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为5分。如果答案是部分正确的,则可以给予适当的得分,例如2分、3分或4分。如果答案完全不正确,则只得1分。\n\n正确性:"
},
diff --git a/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_en.json b/applications/ColossalEval/configs/gpt_evaluation/prompt/evaluation_prompt/evaluation_prompt_en.json
similarity index 59%
rename from applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_en.json
rename to applications/ColossalEval/configs/gpt_evaluation/prompt/evaluation_prompt/evaluation_prompt_en.json
index 8355b0c27b79..3d04387d98c5 100644
--- a/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_en.json
+++ b/applications/ColossalEval/configs/gpt_evaluation/prompt/evaluation_prompt/evaluation_prompt_en.json
@@ -39,53 +39,8 @@
},
"prompt": "You are a good assistant. Please rate the given answer to the \"chat\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
- "classification": {
- "id": 3,
- "category": "classification",
- "metrics": {
- "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
- "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
- "correctness": "Correctness (1-5): whether the answer is correct or not."
- },
- "CoT": {
- "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
- "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
- "correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be given. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
- },
- "prompt": "You are a good assistant. Please rate the given answer to the \"classification\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
- },
- "closed_qa": {
- "id": 4,
- "category": "closed_qa",
- "metrics": {
- "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
- "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
- "correctness": "Correctness (1-5): whether the answer is correct or not."
- },
- "CoT": {
- "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
- "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
- "correctness": "1. Read the question carefully and try to answer the question by yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
- },
- "prompt": "You are a good assistant. Please rate the given answer to the \"closed qa\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
- },
- "extraction": {
- "id": 5,
- "category": "extraction",
- "metrics": {
- "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
- "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
- "correctness": "correctness (1-5): Answers should extract the required information accurately and should not contain any incorrect or misleading information."
- },
- "CoT": {
- "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
- "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
- "correctness": "1. Read the questions carefully and identify the information that needs to be extracted from the material.\n2. Read the answer carefully and make sure it covers all the information that needs to be extracted.\n3. Use the material provided to verify the correctness of the response. If the response is inaccurate or contains incorrect or misleading information, a high score cannot be given.\n4. Check that the answer contains all the information required to be extracted and do not leave out any important details.\n5. Give a score between 1 and 5 based on the correctness and completeness of the response, with a score of 5 indicating a very accurate and complete response and a score of 1 indicating that the response barely extracts the required information.\n\nCorrectness:"
- },
- "prompt": "You are a good assistant. Please rate the given answer to the \"extraction\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
- },
"generation": {
- "id": 6,
+ "id": 3,
"category": "generation",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
@@ -100,7 +55,7 @@
"prompt": "You are a good assistant. Please rate the given answer to the \"generation\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"open_qa": {
- "id": 7,
+ "id": 4,
"category": "open_qa",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
@@ -114,23 +69,8 @@
},
"prompt": "You are a good assistant. Please rate the answers to the \"open qa\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
- "rewriting": {
- "id": 8,
- "category": "rewriting",
- "metrics": {
- "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
- "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
- "correctness": "Correctness (1-5): whether the answer is correct or not."
- },
- "CoT": {
- "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
- "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
- "correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
- },
- "prompt": "You are a good assistant. Please rate the answers to the \"rewriting\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
- },
"roleplay": {
- "id": 9,
+ "id": 5,
"category": "roleplay",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
@@ -146,35 +86,17 @@
},
"prompt": "You are a good assistant. Please rate the given answer to the \"role-play\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
- "summarization": {
- "id": 10,
- "category": "summarization",
- "metrics": {
- "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
- "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
- "correctness": "Correctness (1-5): answers should summarize the main points of the material accurately and unambiguously.",
- "conciseness": "Conciseness (1-5): answers should be concise and without redundant content."
- },
- "CoT": {
- "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
- "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
- "correctness": "1. Read the material given in the question carefully to understand its content and main points.\n2. Assess whether the answer accurately summarizes the key points of the source material.\n3. assess whether the response contains all the key information in the source material.\n4. Based on the above steps, give a score of 1-5, where 1 means that the response does not accurately summarize the main points of the material and 5 means that the response completely accurately summarizes the main points of the material.\n\nCorrectness:",
- "conciseness": "1. Read the title and extract the main points of the material.\n2. Read the summary and note the main ideas and messages in it.\n3. Assess the length of the summary. A concise summary should usually convey key information within a few sentences or paragraphs, rather than lengthy paragraphs or essays.\n4. Check that the summary does not contain information that is not relevant to the main ideas or that is redundant.\n5. Make sure that the summary covers the key information in the material and that no important details have been omitted.\n6. Rate the summary on a scale of 1-5, where 5 means the summary is concise and free of redundancy, and 1 means the summary is lengthy or contains unnecessary information that is difficult to understand or remember. Based on your judgment, assign the appropriate score.\n\nConciseness:"
- },
- "prompt": "You are a good assistant. Please rate the given answer to the \"summarization\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
- },
- "general": {
- "id": 11,
- "category": "general",
+ "Other": {
+ "id": 6,
+ "category": "Other",
"metrics": {
- "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"correctness": "Correctness (1-5): whether the answer is correct or not."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
- "correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
+ "correctness": "1. Read the question carefully and try to answer the question by yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
},
"prompt": "You are a good assistant. Please rate the given answer to the question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
}
diff --git a/applications/ColossalEval/examples/dataset_evaluation/config/evaluation/config.json b/applications/ColossalEval/examples/dataset_evaluation/config/evaluation/config.json
new file mode 100644
index 000000000000..adb540f60345
--- /dev/null
+++ b/applications/ColossalEval/examples/dataset_evaluation/config/evaluation/config.json
@@ -0,0 +1,58 @@
+{
+ "model": [
+ {
+ "name": "model1"
+ },
+ {
+ "name": "model2"
+ }
+ ],
+ "dataset": [
+ {
+ "name": "mmlu",
+ "metrics": [
+ "first_token_accuracy",
+ "single_choice_accuracy",
+ "perplexity",
+ "ppl_score",
+ "ppl_score_over_choices"
+ ]
+ },
+ {
+ "name": "cmmlu",
+ "metrics": [
+ "first_token_accuracy",
+ "single_choice_accuracy",
+ "perplexity",
+ "ppl_score",
+ "ppl_score_over_choices"
+ ]
+ },
+ {
+ "name": "agieval",
+ "metrics": [
+ "first_token_accuracy",
+ "single_choice_accuracy",
+ "multi_choice_accuracy",
+ "math_equivalence",
+ "perplexity",
+ "ppl_score_over_choices",
+ "ppl_score"
+ ]
+ },
+ {
+ "name": "gaokaobench",
+ "metrics": [
+ "first_token_accuracy",
+ "single_choice_accuracy",
+ "multi_choice_accuracy",
+ "math_equivalence",
+ "rouge_score",
+ "rouge_zh_score",
+ "perplexity",
+ "ppl_score_over_choices",
+ "ppl_score"
+ ]
+ }
+ ]
+}
diff --git a/applications/ColossalEval/examples/dataset_evaluation/config/inference/config.json b/applications/ColossalEval/examples/dataset_evaluation/config/inference/config.json
new file mode 100644
index 000000000000..9672c442e647
--- /dev/null
+++ b/applications/ColossalEval/examples/dataset_evaluation/config/inference/config.json
@@ -0,0 +1,84 @@
+{
+ "model": [
+ {
+ "name": "model name",
+ "model_class": "HuggingFaceCausalLM",
+ "parameters": {
+ "path": "path to model",
+ "model_max_length": 4096,
+ "tokenizer_path": "",
+ "tokenizer_kwargs": {
+ "trust_remote_code": true
+ },
+ "peft_path": null,
+ "model_kwargs": {
+ "torch_dtype": "torch.float32",
+ "trust_remote_code": true
+ },
+ "prompt_template": "plain",
+ "batch_size": 4
+ }
+ },
+ {
+ "name": "model2 name",
+ "model_class": "HuggingFaceCausalLM",
+ "parameters": {
+ "path": "path to model2",
+ "model_max_length": 4096,
+ "tokenizer_path": "",
+ "tokenizer_kwargs": {
+ "trust_remote_code": true
+ },
+ "peft_path": null,
+ "model_kwargs": {
+ "torch_dtype": "torch.float32",
+ "trust_remote_code": true
+ },
+ "prompt_template": "plain",
+ "batch_size": 4
+ }
+ }
+ ],
+ "dataset": [
+ {
+ "name": "agieval",
+ "dataset_class": "AGIEvalDataset",
+ "debug": false,
+ "few_shot": false,
+ "path": "path to original dataset (folder)",
+ "save_path": "path to save converted dataset (e.g. inference_data/agieval.json)"
+ },
+ {
+ "name": "ceval",
+ "dataset_class": "CEvalDataset",
+ "debug": false,
+ "few_shot": true,
+ "path": "path to original dataset (folder)",
+ "save_path": "path to save converted dataset (e.g. inference_data/ceval.json)"
+ },
+ {
+ "name": "cmmlu",
+ "dataset_class": "CMMLUDataset",
+ "debug": false,
+ "few_shot": true,
+ "path": "path to original dataset (folder)",
+ "save_path": "path to save converted dataset (e.g. inference_data/cmmlu.json)"
+ },
+ {
+ "name": "gaokaobench",
+ "dataset_class": "GaoKaoBenchDataset",
+ "debug": false,
+ "few_shot": false,
+ "path": "path to original dataset (folder)",
+ "save_path": "path to save converted dataset (e.g. inference_data/gaokaobench.json)"
+ },
+ {
+ "name": "mmlu",
+ "dataset_class": "MMLUDataset",
+ "debug": false,
+ "few_shot": true,
+ "path": "path to original dataset (folder)",
+ "save_path": "path to save converted dataset (e.g. inference_data/mmlu.json)"
+ }
+ ]
+}
diff --git a/applications/ColossalEval/examples/dataset_evaluation/eval_dataset.py b/applications/ColossalEval/examples/dataset_evaluation/eval_dataset.py
new file mode 100644
index 000000000000..5724c6e40693
--- /dev/null
+++ b/applications/ColossalEval/examples/dataset_evaluation/eval_dataset.py
@@ -0,0 +1,75 @@
+import argparse
+import os
+
+import tabulate
+from colossal_eval.evaluate.dataset_evaluator import DatasetEvaluator
+from colossal_eval.utils import jdump, jload
+
+
+def main(args):
+ config = jload(args.config)
+
+ evaluation_results = {dataset["name"]: {} for dataset in config["dataset"]}
+ evaluation_results_table = {dataset["name"]: {} for dataset in config["dataset"]}
+ evaluator = DatasetEvaluator(args.config, args.evaluation_results_save_path)
+
+ for dataset_parameter in config["dataset"]:
+ dataset_name = dataset_parameter["name"]
+ metrics = dataset_parameter["metrics"]
+ results_metric_model = {metric: {model["name"]: None for model in config["model"]} for metric in metrics}
+ for model in config["model"]:
+ model_name = model["name"]
+
+ data = jload(
+ os.path.join(args.inference_results_path, model_name, f"{dataset_name}_inference_results.json")
+ )
+ results = evaluator.get_evaluation_results(data, dataset_name, model_name, metrics)
+
+ for metric, score in results.items():
+ if metric not in results_metric_model:
+ results_metric_model[metric] = {model["name"]: None for model in config["model"]}
+ results_metric_model[metric][model_name] = score["ALL"]
+
+ evaluation_results[dataset_name][model_name] = results
+
+ evaluation_results_table[dataset_name] = results_metric_model
+
+ table = []
+ header = ["dataset", "metric"] + [model["name"] for model in config["model"]]
+ table.append(header)
+
+ for dataset_parameter in config["dataset"]:
+ dataset_name = dataset_parameter["name"]
+ metrics = dataset_parameter["metrics"]
+
+ for metric, model_results in evaluation_results_table[dataset_name].items():
+ row = [dataset_name]
+ for model, score in model_results.items():
+ if len(row) == 1:
+ row.extend([metric, "{:.02f}".format(score)])
+ else:
+ row.append("{:.02f}".format(score))
+
+ table.append(row)
+
+ table = tabulate.tabulate(table, headers="firstrow")
+ print(table)
+
+ os.makedirs(args.evaluation_results_save_path, exist_ok=True)
+
+ with open(os.path.join(args.evaluation_results_save_path, "evaluation_results_table.txt"), "w") as file:
+ file.write(table)
+
+ jdump(evaluation_results, os.path.join(args.evaluation_results_save_path, "evaluation_results.json"))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="ColossalEval evaluation process.")
+ parser.add_argument("--config", type=str, default=None, required=True, help="path to config file")
+ parser.add_argument("--inference_results_path", type=str, default=None, help="path to inference results")
+ parser.add_argument(
+ "--evaluation_results_save_path", type=str, default=None, help="path to save evaluation results"
+ )
+ args = parser.parse_args()
+
+ main(args)
diff --git a/applications/ColossalEval/examples/dataset_evaluation/eval_dataset.sh b/applications/ColossalEval/examples/dataset_evaluation/eval_dataset.sh
new file mode 100644
index 000000000000..ad0bfc03acbb
--- /dev/null
+++ b/applications/ColossalEval/examples/dataset_evaluation/eval_dataset.sh
@@ -0,0 +1,4 @@
+python eval_dataset.py \
+ --config "path to config file" \
+ --inference_results_path "path to inference results" \
+ --evaluation_results_save_path "path to save evaluation results"
diff --git a/applications/ColossalEval/examples/dataset_evaluation/inference.py b/applications/ColossalEval/examples/dataset_evaluation/inference.py
new file mode 100644
index 000000000000..b3579424ae1c
--- /dev/null
+++ b/applications/ColossalEval/examples/dataset_evaluation/inference.py
@@ -0,0 +1,187 @@
+import argparse
+import copy
+import os
+from typing import Dict, List
+
+import torch
+import torch.distributed as dist
+from colossal_eval import dataset, models, utils
+
+import colossalai
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
+
+def rm_and_merge(world_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
+ """
+ Remove inference result per rank and merge them into one file.
+
+ Args:
+ world_size: Number of processes for inference.
+ save_path: The folder for storing inference results.
+ model_names: Names of models for inference.
+ dataset_names: Names of dataset for inference.
+
+ """
+
+ for model_name in model_names:
+ for dataset_name, categories in dataset_names.items():
+ all_answers = {}
+ for category in categories:
+ all_answers[category] = {"data": []}
+ answers = {"data": []}
+
+ for r in range(world_size):
+ directory = os.path.join(
+ save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
+ )
+ if not os.path.exists(directory):
+ raise Exception(
+ f"Directory {directory} not found. There may be an error during inference time."
+ )
+ else:
+ rank_answers = utils.jload(directory)
+ answers["data"].extend(rank_answers["data"])
+ answers["inference_kwargs"] = rank_answers["inference_kwargs"]
+
+ for r in range(world_size):
+ try:
+ directory = os.path.join(
+ save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
+ )
+ os.remove(directory)
+ except Exception as e:
+ print(e)
+
+ all_answers[category] = answers
+
+ logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
+ utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
+
+ logger.info(f"Save inference results of model {model_name} for all dataset.")
+ logger.info(f"Save inference results of all models for all dataset.")
+
+
+def main(args):
+ colossalai.launch_from_torch(config={}, seed=42)
+ world_size = dist.get_world_size()
+ rank = dist.get_rank()
+
+ inference_data = {}
+ debug_args = {}
+ few_shot_args = {}
+ multiturn_args = {}
+
+ config = utils.jload(args.config)
+
+ model_parameters = config["model"]
+ dataset_parameters = config["dataset"]
+
+ for dataset_parameter in dataset_parameters:
+ path = dataset_parameter["path"]
+ save_path = dataset_parameter["save_path"]
+ dataset_name = dataset_parameter["name"]
+ debug_args[dataset_name] = dataset_parameter["debug"]
+ few_shot_args[dataset_name] = dataset_parameter["few_shot"]
+
+ if not args.load_dataset:
+ if os.path.exists(save_path):
+ dataset_ = utils.jload(save_path)
+ inference_data[dataset_name] = dataset_["test"]
+ else:
+ raise Exception(
+ "Can't find the converted dataset. You may set load_dataset True to store the dataset first."
+ )
+
+ continue
+
+ dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
+ if not issubclass(dataset_class, dataset.BaseDataset):
+ raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
+
+ dataset_ = dataset_class(path, logger, dataset_parameter["few_shot"])
+
+ dataset_.save(save_path)
+
+ if hasattr(dataset_, "multiturn") and dataset_.multiturn:
+ multiturn_args[dataset_name] = True
+ logger.info(f"{dataset_parameter['dataset_class']} is a multiturn dataset.")
+ else:
+ multiturn_args[dataset_name] = False
+
+ inference_data[dataset_name] = dataset_.dataset["test"]
+
+ for model_parameter in model_parameters:
+ model_name = model_parameter["name"]
+ model_class = eval(f"models.{model_parameter['model_class']}")
+ paramerters = model_parameter["parameters"]
+ paramerters.update({"logger": logger})
+ paramerters.update({"prompt_template": utils.prompt_templates[paramerters["prompt_template"]]})
+
+ model_ = model_class(**paramerters)
+ if not issubclass(model_class, models.BaseModel):
+ raise ValueError(f"Model class {model_parameter['model_class']} is not a subclass of BaseModel.")
+
+ for dataset_name, split_data in inference_data.items():
+ start = 0
+ prev_questions = None
+ for category, category_data in split_data.items():
+ num_turn = category_data["inference_kwargs"].get("turns", 1)
+
+ if few_shot_args[dataset_name] and category_data["inference_kwargs"].get("few_shot_data", None) is None:
+ raise Exception(f"Dataset {dataset_name} doesn't have few-shot data for category {category}!")
+
+ answers_to_dump = copy.deepcopy(category_data)
+ partition_size = len(category_data["data"]) // world_size
+ redundant = len(category_data["data"]) % world_size
+
+ # Ensure that the amount of data for inference is as consistent as possible across different processes.
+ lengths = [partition_size for _ in range(world_size)]
+ for j in range(redundant):
+ lengths[(j + start) % world_size] += 1
+
+ start = (start + redundant) % world_size
+
+ for turn in range(num_turn):
+ if turn == 0:
+ questions = category_data["data"][sum(lengths[0:rank]) : sum(lengths[0:rank]) + lengths[rank]]
+ else:
+ questions = prev_questions
+
+ answers_per_rank = model_.inference(
+ questions, inference_kwargs=category_data["inference_kwargs"], debug=debug_args[dataset_name]
+ )
+ prev_questions = answers_per_rank
+
+ answers_to_dump["data"] = answers_per_rank
+
+ utils.jdump(
+ answers_to_dump,
+ os.path.join(
+ args.inference_save_path,
+ model_name,
+ f"{dataset_name}_{category}_inference_results_rank{rank}.json",
+ ),
+ )
+
+ logger.info(f"Rank {rank} peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB")
+
+ del model_
+ torch.cuda.empty_cache()
+
+ dist.barrier()
+ if rank == 0:
+ model_names = [model_parameter["name"] for model_parameter in model_parameters]
+ dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
+ rm_and_merge(world_size, args.inference_save_path, model_names, dataset_names)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="ColossalEval inference process.")
+ parser.add_argument("--config", type=str, default=None, required=True, help="path to config file")
+ parser.add_argument("--load_dataset", default=False, action="store_true")
+ parser.add_argument("--inference_save_path", type=str, default=None, help="path to save inference results")
+ args = parser.parse_args()
+
+ main(args)
diff --git a/applications/ColossalEval/examples/dataset_evaluation/inference.sh b/applications/ColossalEval/examples/dataset_evaluation/inference.sh
new file mode 100644
index 000000000000..15f9afd56045
--- /dev/null
+++ b/applications/ColossalEval/examples/dataset_evaluation/inference.sh
@@ -0,0 +1,4 @@
+torchrun --nproc_per_node=1 inference.py \
+ --config "path to config file" \
+ --load_dataset \
+ --inference_save_path "path to save inference results"
diff --git a/applications/ColossalEval/examples/gpt_evaluation/config/evaluation/config.json b/applications/ColossalEval/examples/gpt_evaluation/config/evaluation/config.json
new file mode 100644
index 000000000000..6ebe3996b1cf
--- /dev/null
+++ b/applications/ColossalEval/examples/gpt_evaluation/config/evaluation/config.json
@@ -0,0 +1,44 @@
+{
+ "language": "en",
+ "category": {
+ "brainstorming": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "creativity",
+ "practicality",
+ "reasonableness"
+ ]
+ },
+ "chat": {
+ "GPT": [
+ "language organization",
+ "naturalness",
+ "engagingness",
+ "fidelity"
+ ]
+ },
+ "generation": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "diversity"
+ ]
+ },
+ "open_qa": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "correctness"
+ ]
+ },
+ "roleplay": {
+ "GPT": [
+ "language organization",
+ "relevance",
+ "fidelity",
+ "creativity"
+ ]
+ }
+ }
+}
diff --git a/applications/ColossalEval/examples/gpt_evaluation/config/inference/config.json b/applications/ColossalEval/examples/gpt_evaluation/config/inference/config.json
new file mode 100644
index 000000000000..7ed7491a87c5
--- /dev/null
+++ b/applications/ColossalEval/examples/gpt_evaluation/config/inference/config.json
@@ -0,0 +1,33 @@
+{
+ "model": [
+ {
+ "name": "model name",
+ "model_class": "HuggingFaceCausalLM",
+ "parameters": {
+ "path": "path to model",
+ "model_max_length": 4096,
+ "tokenizer_path": "",
+ "tokenizer_kwargs": {
+ "trust_remote_code": true
+ },
+ "peft_path": null,
+ "model_kwargs": {
+ "torch_dtype": "torch.float32",
+ "trust_remote_code": true
+ },
+ "prompt_template": "plain",
+ "batch_size": 4
+ }
+ }
+ ],
+ "dataset": [
+ {
+ "name": "colossal",
+ "dataset_class": "ColossalDataset",
+ "debug": false,
+ "few_shot": false,
+ "path": "../../configs/gpt_evaluation/data/eval_en_examples.json",
+ "save_path": "path to save converted dataset (inference_data/colossal.json)"
+ }
+ ]
+}
diff --git a/applications/ColossalEval/examples/gpt_evaluation/eval.py b/applications/ColossalEval/examples/gpt_evaluation/eval.py
new file mode 100644
index 000000000000..cd521af59823
--- /dev/null
+++ b/applications/ColossalEval/examples/gpt_evaluation/eval.py
@@ -0,0 +1,139 @@
+import argparse
+import os
+
+import openai
+from colossal_eval.evaluate.evaluator import Evaluator
+from colossal_eval.utils import jload
+
+
+def main(args):
+ assert len(args.answer_file_list) == len(
+ args.model_name_list
+ ), "The number of answer files and model names should be equal!"
+
+ # load config
+ config = jload(args.config_file)
+
+ if config["language"] in ["cn", "en"]:
+ # get metric settings for all categories
+ metrics_per_category = {}
+ for category in config["category"].keys():
+ metrics_all = {}
+ for metric_type, metrics in config["category"][category].items():
+ metrics_all[metric_type] = metrics
+ metrics_per_category[category] = metrics_all
+
+ battle_prompt = None
+ if args.battle_prompt_file:
+ battle_prompt = jload(args.battle_prompt_file)
+
+ gpt_evaluation_prompt = None
+ if args.gpt_evaluation_prompt_file:
+ gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)
+
+ if len(args.model_name_list) == 2 and not battle_prompt:
+ raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")
+
+ if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
+ raise Exception(
+ "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!"
+ )
+
+ if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
+ raise Exception(
+ "GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."
+ )
+
+ # initialize evaluator
+ evaluator = Evaluator(
+ metrics_per_category,
+ battle_prompt,
+ gpt_evaluation_prompt,
+ args.gpt_model,
+ config["language"],
+ args.gpt_with_reference,
+ )
+ if len(args.model_name_list) == 2:
+ answers_1 = jload(args.answer_file_list[0])
+ answers_2 = jload(args.answer_file_list[1])
+
+ answers1 = []
+ for category, value in answers_1.items():
+ answers1.extend(value["data"])
+
+ answers2 = []
+ for category, value in answers_2.items():
+ answers2.extend(value["data"])
+
+ assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"
+
+ evaluator.battle(answers1=answers1, answers2=answers2)
+ evaluator.save(args.save_path, args.model_name_list)
+ elif len(args.model_name_list) == 1:
+ targets = jload(args.target_file)
+ answers = jload(args.answer_file_list[0])
+
+ references = []
+ for category, value in targets["test"].items():
+ references.extend(value["data"])
+
+ predictions = []
+ for category, value in answers.items():
+ predictions.extend(value["data"])
+
+ assert len(references) == len(
+ predictions
+ ), "The number of target answers and model answers should be equal!"
+
+ evaluator.evaluate(
+ answers=predictions, targets=references, save_path=args.save_path, model_name=args.model_name_list[0]
+ )
+ evaluator.save(args.save_path, args.model_name_list)
+ else:
+ raise ValueError("Unsupported number of answer files and model names!")
+ else:
+ raise ValueError(f'Unsupported language {config["language"]}!')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="ColossalAI LLM evaluation pipeline.")
+ parser.add_argument(
+ "--config_file", type=str, default=None, required=True, help="path to the file of target results"
+ )
+ parser.add_argument("--battle_prompt_file", type=str, default=None, help="path to the prompt file for battle")
+ parser.add_argument(
+ "--gpt_evaluation_prompt_file", type=str, default=None, help="path to the prompt file for gpt evaluation"
+ )
+ parser.add_argument("--target_file", type=str, default=None, help="path to the target answer (ground truth) file")
+ parser.add_argument(
+ "--answer_file_list",
+ type=str,
+ nargs="+",
+ default=[],
+ required=True,
+ help="path to the answer files of at most 2 models",
+ )
+ parser.add_argument(
+ "--model_name_list", type=str, nargs="+", default=[], required=True, help="the names of at most 2 models"
+ )
+ parser.add_argument(
+ "--gpt_model",
+ default="gpt-3.5-turbo-16k",
+ choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4"],
+ help="which GPT model to use for evaluation",
+ )
+ parser.add_argument(
+ "--gpt_with_reference",
+ default=False,
+ action="store_true",
+ help="whether to include reference answer in gpt evaluation",
+ )
+ parser.add_argument("--save_path", type=str, default="results", help="path to save evaluation results")
+ parser.add_argument("--openai_key", type=str, default=None, required=True, help="Your openai key")
+ args = parser.parse_args()
+
+ if args.openai_key is not None:
+ os.environ["OPENAI_API_KEY"] = args.openai_key
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+
+ main(args)
diff --git a/applications/Chat/evaluate/eval.sh b/applications/ColossalEval/examples/gpt_evaluation/eval.sh
old mode 100755
new mode 100644
similarity index 100%
rename from applications/Chat/evaluate/eval.sh
rename to applications/ColossalEval/examples/gpt_evaluation/eval.sh
diff --git a/applications/ColossalEval/examples/gpt_evaluation/inference.py b/applications/ColossalEval/examples/gpt_evaluation/inference.py
new file mode 100644
index 000000000000..657fc33bf1ef
--- /dev/null
+++ b/applications/ColossalEval/examples/gpt_evaluation/inference.py
@@ -0,0 +1,171 @@
+import argparse
+import copy
+import os
+from typing import Dict, List
+
+import torch
+import torch.distributed as dist
+from colossal_eval import dataset, models, utils
+
+import colossalai
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
+
+def rm_and_merge(world_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
+ """
+ Remove inference result per rank and merge them into one file.
+
+ Args:
+ world_size: Number of processes for inference.
+ save_path: The folder for storing inference results.
+ model_names: Names of models for inference.
+ dataset_names: Names of dataset for inference.
+
+ """
+
+ for model_name in model_names:
+ for dataset_name, categories in dataset_names.items():
+ all_answers = {}
+ for category in categories:
+ all_answers[category] = {"data": []}
+ answers = {"data": []}
+
+ for r in range(world_size):
+ directory = os.path.join(
+ save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
+ )
+ if not os.path.exists(directory):
+ raise Exception(
+ f"Directory {directory} not found. There may be an error during inference time."
+ )
+ else:
+ rank_answers = utils.jload(directory)
+ answers["data"].extend(rank_answers["data"])
+ answers["inference_kwargs"] = rank_answers["inference_kwargs"]
+
+ for r in range(world_size):
+ try:
+ directory = os.path.join(
+ save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
+ )
+ os.remove(directory)
+ except Exception as e:
+ print(e)
+
+ all_answers[category] = answers
+
+ logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
+ utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
+
+ logger.info(f"Save inference results of model {model_name} for all dataset.")
+ logger.info(f"Save inference results of all models for all dataset.")
+
+
+def main(args):
+ colossalai.launch_from_torch(config={}, seed=42)
+ world_size = dist.get_world_size()
+ rank = dist.get_rank()
+
+ inference_data = {}
+ debug_args = {}
+ few_shot_args = {}
+
+ config = utils.jload(args.config)
+
+ model_parameters = config["model"]
+ dataset_parameters = config["dataset"]
+
+ for dataset_parameter in dataset_parameters:
+ path = dataset_parameter["path"]
+ save_path = dataset_parameter["save_path"]
+ dataset_name = dataset_parameter["name"]
+ debug_args[dataset_name] = dataset_parameter["debug"]
+ few_shot_args[dataset_name] = dataset_parameter["few_shot"]
+
+ if not args.load_dataset:
+ if os.path.exists(save_path):
+ dataset_ = utils.jload(save_path)
+ inference_data[dataset_name] = dataset_["test"]
+ else:
+ raise Exception(
+ "Can't find the converted dataset. You may set load_dataset True to store the dataset first."
+ )
+
+ continue
+
+ dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
+ if not issubclass(dataset_class, dataset.BaseDataset):
+ raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
+
+ dataset_ = dataset_class(path, logger, dataset_parameter["few_shot"])
+
+ dataset_.save(save_path)
+ inference_data[dataset_name] = dataset_.dataset["test"]
+
+ for model_parameter in model_parameters:
+ model_name = model_parameter["name"]
+ model_class = eval(f"models.{model_parameter['model_class']}")
+ paramerters = model_parameter["parameters"]
+ paramerters.update({"logger": logger})
+ paramerters.update({"prompt_template": utils.prompt_templates[paramerters["prompt_template"]]})
+
+ model_ = model_class(**paramerters)
+ if not issubclass(model_class, models.BaseModel):
+ raise ValueError(f"Model class {model_parameter['model_class']} is not a subclass of BaseModel.")
+
+ for dataset_name, split_data in inference_data.items():
+ start = 0
+ for category, category_data in split_data.items():
+ if few_shot_args[dataset_name] and category_data["inference_kwargs"].get("few_shot_data", None) is None:
+ raise Exception(f"Dataset {dataset_name} doesn't have few-shot data for category {category}!")
+
+ answers_to_dump = copy.deepcopy(category_data)
+ partition_size = len(category_data["data"]) // world_size
+ redundant = len(category_data["data"]) % world_size
+
+ # Ensure that the amount of data for inference is as consistent as possible across different processes.
+ lengths = [partition_size for _ in range(world_size)]
+ for j in range(redundant):
+ lengths[(j + start) % world_size] += 1
+
+ start = (start + redundant) % world_size
+
+ questions = category_data["data"][sum(lengths[0:rank]) : sum(lengths[0:rank]) + lengths[rank]]
+
+ answers_per_rank = model_.inference(
+ questions, inference_kwargs=category_data["inference_kwargs"], debug=debug_args[dataset_name]
+ )
+
+ answers_to_dump["data"] = answers_per_rank
+
+ utils.jdump(
+ answers_to_dump,
+ os.path.join(
+ args.inference_save_path,
+ model_name,
+ f"{dataset_name}_{category}_inference_results_rank{rank}.json",
+ ),
+ )
+
+ logger.info(f"Rank {rank} peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB")
+
+ del model_
+ torch.cuda.empty_cache()
+
+ dist.barrier()
+ if rank == 0:
+ model_names = [model_parameter["name"] for model_parameter in model_parameters]
+ dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
+ rm_and_merge(world_size, args.inference_save_path, model_names, dataset_names)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="ColossalEval inference process.")
+ parser.add_argument("--config", type=str, default=None, required=True, help="path to config file")
+ parser.add_argument("--load_dataset", default=False, action="store_true")
+ parser.add_argument("--inference_save_path", type=str, default=None, help="path to save inference results")
+ args = parser.parse_args()
+
+ main(args)
diff --git a/applications/ColossalEval/examples/gpt_evaluation/inference.sh b/applications/ColossalEval/examples/gpt_evaluation/inference.sh
new file mode 100644
index 000000000000..15f9afd56045
--- /dev/null
+++ b/applications/ColossalEval/examples/gpt_evaluation/inference.sh
@@ -0,0 +1,4 @@
+torchrun --nproc_per_node=1 inference.py \
+ --config "path to config file" \
+ --load_dataset \
+ --inference_save_path "path to save inference results"
diff --git a/applications/ColossalEval/requirements.txt b/applications/ColossalEval/requirements.txt
new file mode 100644
index 000000000000..c110606e0303
--- /dev/null
+++ b/applications/ColossalEval/requirements.txt
@@ -0,0 +1,12 @@
+transformers>=4.32.0
+colossalai>=0.3.1
+peft
+tabulate
+jieba
+fuzzywuzzy
+rouge
+openai
+matplotlib
+pandas
+seaborn
+scikit-learn
diff --git a/applications/ColossalEval/setup.py b/applications/ColossalEval/setup.py
new file mode 100644
index 000000000000..4f7b1bb5c42e
--- /dev/null
+++ b/applications/ColossalEval/setup.py
@@ -0,0 +1,31 @@
+from setuptools import find_packages, setup
+
+
+def fetch_requirements(path):
+ with open(path, "r") as fd:
+ return [r.strip() for r in fd.readlines()]
+
+
+def fetch_readme():
+ with open("README.md", encoding="utf-8") as f:
+ return f.read()
+
+
+setup(
+ name="colossal_eval",
+ version="0.0.1",
+ packages=find_packages(exclude=["examples", "*.egg-info"]),
+ description="Colossal-AI LLM-Evaluation Framework",
+ long_description=fetch_readme(),
+ long_description_content_type="text/markdown",
+ license="Apache Software License 2.0",
+ url="https://github.com/hpcaitech/LLM-Evaluation",
+ install_requires=fetch_requirements("requirements.txt"),
+ python_requires=">=3.6",
+ classifiers=[
+ "Programming Language :: Python :: 3",
+ "License :: OSI Approved :: Apache Software License",
+ "Environment :: GPU :: NVIDIA CUDA",
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ ],
+)
diff --git a/applications/ColossalQA/.gitignore b/applications/ColossalQA/.gitignore
new file mode 100644
index 000000000000..5f5e159a22fc
--- /dev/null
+++ b/applications/ColossalQA/.gitignore
@@ -0,0 +1,152 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+docs/.build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# IDE
+.idea/
+.vscode/
+
+# macos
+*.DS_Store
+#data/
+
+docs/.build
+
+# pytorch checkpoint
+*.pt
+
+# sql
+*.db
+
+# wandb log
+example/wandb/
+example/ui/gradio/
+example/vector_db_for_test
+examples/awesome-chatgpt-prompts/
diff --git a/applications/ColossalQA/README.md b/applications/ColossalQA/README.md
new file mode 100644
index 000000000000..d9ffe5beb605
--- /dev/null
+++ b/applications/ColossalQA/README.md
@@ -0,0 +1,258 @@
+# ColossalQA - Langchain-based Document Retrieval Conversation System
+
+## Table of Contents
+
+- [Table of Contents](#table-of-contents)
+- [Overall Implementation](#overall-implementation)
+- [Install](#install)
+- [How to Use](#how-to-use)
+- Examples
+ - [A Simple Web UI Demo](examples/webui_demo/README.md)
+ - [Local Chinese Retrieval QA + Chat](examples/retrieval_conversation_zh.py)
+ - [Local English Retrieval QA + Chat](examples/retrieval_conversation_en.py)
+ - [Local Bi-lingual Retrieval QA + Chat](examples/retrieval_conversation_universal.py)
+ - [Experimental AI Agent Based on Chatgpt + Chat](examples/conversation_agent_chatgpt.py)
+- Use cases
+ - [English customer service chatbot](examples/retrieval_conversation_en_customer_service.py)
+ - [Chinese customer service intent classification](examples/retrieval_intent_classification_zh_customer_service.py)
+
+**As Colossal-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.**
+
+## Overall Implementation
+
+### Highlevel Design
+
+
+
+
+Fig.1. Design of the document retrieval conversation system
+
+
+Retrieval-based Question Answering (QA) is a crucial application of natural language processing that aims to find the most relevant answers based on the information from a corpus of text documents in response to user queries. Vector stores, which represent documents and queries as vectors in a high-dimensional space, have gained popularity for their effectiveness in retrieval QA tasks.
+
+#### Step 1: Collect Data
+
+A successful retrieval QA system starts with high-quality data. You need a collection of text documents that's related to your application. You may also need to manually design how your data will be presented to the language model.
+
+#### Step 2: Split Data
+
+Document data is usually too long to fit into the prompt due to the context length limitation of LLMs. Supporting documents need to be splited into short chunks before constructing vector stores. In this demo, we use neural text spliter for better performance.
+
+#### Step 3: Construct Vector Stores
+Choose a embedding function and embed your text chunk into high dimensional vectors. Once you have vectors for your documents, you need to create a vector store. The vector store should efficiently index and retrieve documents based on vector similarity. In this demo, we use [Chroma](https://python.langchain.com/docs/integrations/vectorstores/chroma) and incrementally update indexes of vector stores. Through incremental update, one can update and maintain a vector store without recalculating every embedding.
+You are free to choose any vectorstore from a varity of [vector stores](https://python.langchain.com/docs/integrations/vectorstores/) supported by Langchain. However, the incremental update only works with LangChain vectorstore's that support:
+- Document addition by id (add_documents method with ids argument)
+- Delete by id (delete method with)
+
+#### Step 4: Retrieve Relative Text
+Upon querying, we will run a reference resolution on user's input, the goal of this step is to remove ambiguous reference in user's query such as "this company", "him". We then embed the query with the same embedding function and query the vectorstore to retrieve the top-k most similar documents.
+
+#### Step 5: Format Prompt
+The prompt carries essential information including task description, conversation history, retrived documents, and user's query for the LLM to generate a response. Please refer to this [README](./colossalqa/prompt/README.md) for more details.
+
+#### Step 6: Inference
+Pass the prompt to the LLM with additional generaton arguments to get agent response. You can control the generation with additional arguments such as temperature, top_k, top_p, max_new_tokens. You can also define when to stop by passing the stop substring to the retrieval QA chain.
+
+#### Step 7: Update Memory
+We designed a memory module that automatically summarize overlength conversation to fit the max context length of LLM. In this step, we update the memory with the newly generated response. To fix into the context length of a given LLM, we sumarize the overlength part of historical conversation and present the rest in round-based conversation format. Fig.2. shows how the memory is updated. Please refer to this [README](./colossalqa/prompt/README.md) for dialogue format.
+
+
+
+Fig.2. Design of the memory module
+
+
+### Supported Language Models (LLMs) and Embedding Models
+
+Our platform accommodates two kinds of LLMs: API-accessible and locally run models. For the API-style LLMs, we support ChatGPT, Pangu, and models deployed through the vLLM API Server. For locally operated LLMs, we are compatible with any language model that can be initiated using [`transformers.AutoModel.from_pretrained`](https://huggingface.co/transformers/v3.0.2/model_doc/auto.html#transformers.AutoModel.from_pretrained). However, due to the dependence of retrieval-based QA on the language model's abilities in zero-shot learning, instruction following, and logical reasoning, smaller models are typically not advised. In our local demo, we utilize ChatGLM2 for Chinese and LLaMa2 for English. Modifying the base LLM requires corresponding adjustments to the prompts.
+
+Here are some sample codes to load different types of LLM.
+
+```python
+# For locally-run LLM
+from colossalqa.local.llm import ColossalAPI, ColossalLLM
+api = ColossalAPI('chatglm2', 'path_to_chatglm2_checkpoint')
+llm = ColossalLLM(n=1, api=api)
+
+# For LLMs running on the vLLM API Server
+from colossalqa.local.llm import VllmAPI, VllmLLM
+vllm_api = VllmAPI("Your_vLLM_Host", "Your_vLLM_Port")
+llm = VllmLLM(n=1, api=vllm_api)
+
+# For ChatGPT LLM
+from langchain.llms import OpenAI
+llm = OpenAI(openai_api_key="YOUR_OPENAI_API_KEY")
+
+# For Pangu LLM
+# set up your authentification info
+from colossalqa.local.pangu_llm import Pangu
+os.environ["URL"] = ""
+os.environ["URLNAME"] = ""
+os.environ["PASSWORD"] = ""
+os.environ["DOMAIN_NAME"] = ""
+
+llm = Pangu(id=1)
+llm.set_auth_config()
+```
+
+Regarding embedding models, we support all models that can be loaded via ["langchain.embeddings.HuggingFaceEmbeddings"](https://api.python.langchain.com/en/latest/embeddings/langchain.embeddings.huggingface.HuggingFaceEmbeddings.html). The default embedding model used in this demo is ["moka-ai/m3e-base"](https://huggingface.co/moka-ai/m3e-base), which enables consistent text similarity computations in both Chinese and English.
+
+In the future, supported LLM will also include models running on colossal inference and serving framework.
+
+## Install
+
+Install colossalqa
+```bash
+# python==3.8.17
+cd ColossalAI/applications/ColossalQA
+pip install -e .
+```
+
+To use the vLLM for providing LLM services via an API, please consult the official guide [here](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html#api-server) to start the API server. It's important to set up a new virtual environment for installing vLLM, as there are currently some dependency conflicts between vLLM and ColossalQA when installed on the same machine.
+
+## How to Use
+
+### Collect Your Data
+
+For ChatGPT based Agent we support document retrieval and simple sql search.
+If you want to run the demo locally, we provided document retrieval based conversation system built upon langchain. It accept a wide range of documents. After collecting your data, put your data under a folder.
+
+Read comments under ./colossalqa/data_loader for more detail regarding supported data formats.
+
+### Run The Script
+
+We provide a simple Web UI demo of ColossalQA, enabling you to upload your files as a knowledge base and interact with them through a chat interface in your browser. More details can be found [here](examples/webui_demo/README.md)
+
+
+We also provided some scripts for Chinese document retrieval based conversation system, English document retrieval based conversation system, Bi-lingual document retrieval based conversation system and an experimental AI agent with document retrieval and SQL query functionality. The Bi-lingual one is a high-level wrapper for the other two clases. We write different scripts for different languages because retrieval QA requires different embedding models, LLMs, prompts for different language setting. For now, we use LLaMa2 for English retrieval QA and ChatGLM2 for Chinese retrieval QA for better performance.
+
+To run the bi-lingual scripts.
+```bash
+python retrieval_conversation_universal.py \
+ --en_model_path /path/to/Llama-2-7b-hf \
+ --zh_model_path /path/to/chatglm2-6b \
+ --zh_model_name chatglm2 \
+ --en_model_name llama \
+ --sql_file_path /path/to/any/folder
+```
+
+To run retrieval_conversation_en.py.
+```bash
+python retrieval_conversation_en.py \
+ --model_path /path/to/Llama-2-7b-hf \
+ --model_name llama \
+ --sql_file_path /path/to/any/folder
+```
+
+To run retrieval_conversation_zh.py.
+```bash
+python retrieval_conversation_zh.py \
+ --model_path /path/to/chatglm2-6b \
+ --model_name chatglm2 \
+ --sql_file_path /path/to/any/folder
+```
+
+To run retrieval_conversation_chatgpt.py.
+```bash
+python retrieval_conversation_chatgpt.py \
+ --open_ai_key_path /path/to/plain/text/openai/key/file \
+ --sql_file_path /path/to/any/folder
+```
+
+To run conversation_agent_chatgpt.py.
+```bash
+python conversation_agent_chatgpt.py \
+ --open_ai_key_path /path/to/plain/text/openai/key/file
+```
+
+After runing the script, it will ask you to provide the path to your data during the execution of the script. You can also pass a glob path to load multiple files at once. Please read this [guide](https://docs.python.org/3/library/glob.html) on how to define glob path. Follow the instruction and provide all files for your retrieval conversation system then type "ESC" to finish loading documents. If csv files are provided, please use "," as delimiter and "\"" as quotation mark. For json and jsonl files. The default format is
+```
+{
+ "data":[
+ {"content":"XXX"},
+ {"content":"XXX"}
+ ...
+ ]
+}
+```
+For other formats, please refer to [this document](https://python.langchain.com/docs/modules/data_connection/document_loaders/json) on how to define schema for data loading. There are no other formatting constraints for loading documents type files. For loading table type files, we use pandas, please refer to [Pandas-Input/Output](https://pandas.pydata.org/pandas-docs/stable/reference/io.html) for file format details.
+
+We also support another kay-value mode that utilizes a user-defined key to calculate the embeddings of the vector store. If a query matches a specific key, the value corresponding to that key will be used to generate the prompt. For instance, in the document below, "My coupon isn't working." will be employed during indexing, whereas "Question: My coupon isn't working.\nAnswer: We apologize for ... apply it to?" will appear in the final prompt. This format is typically useful when the task involves carrying on a conversation with readily accessible conversation data, such as customer service, question answering.
+```python
+Document(page_content="My coupon isn't working.", metadata={'is_key_value_mapping': True, 'seq_num': 36, 'source': 'XXX.json', 'value': "Question: My coupon isn't working.\nAnswer:We apologize for the inconvenience. Can you please provide the coupon code and the product name or SKU you're trying to apply it to?"})
+```
+
+For now, we only support the key-value mode for json data files. You can run the script retrieval_conversation_en_customer_service.py by the following command.
+
+```bash
+python retrieval_conversation_en_customer_service.py \
+ --model_path /path/to/Llama-2-7b-hf \
+ --model_name llama \
+ --sql_file_path /path/to/any/folder
+```
+
+## The Plan
+
+- [x] build document retrieval QA tool
+- [x] Add memory
+- [x] Add demo for AI agent with SQL query
+- [x] Add customer retriever for fast construction and retrieving (with incremental update)
+
+## Reference
+
+```bibtex
+@software{Chase_LangChain_2022,
+author = {Chase, Harrison},
+month = oct,
+title = {{LangChain}},
+url = {https://github.com/hwchase17/langchain},
+year = {2022}
+}
+```
+```bibtex
+@inproceedings{DBLP:conf/asru/ZhangCLLW21,
+ author = {Qinglin Zhang and
+ Qian Chen and
+ Yali Li and
+ Jiaqing Liu and
+ Wen Wang},
+ title = {Sequence Model with Self-Adaptive Sliding Window for Efficient Spoken
+ Document Segmentation},
+ booktitle = {{IEEE} Automatic Speech Recognition and Understanding Workshop, {ASRU}
+ 2021, Cartagena, Colombia, December 13-17, 2021},
+ pages = {411--418},
+ publisher = {{IEEE}},
+ year = {2021},
+ url = {https://doi.org/10.1109/ASRU51503.2021.9688078},
+ doi = {10.1109/ASRU51503.2021.9688078},
+ timestamp = {Wed, 09 Feb 2022 09:03:04 +0100},
+ biburl = {https://dblp.org/rec/conf/asru/ZhangCLLW21.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+```
+```bibtex
+@misc{touvron2023llama,
+ title={Llama 2: Open Foundation and Fine-Tuned Chat Models},
+ author={Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic and Sergey Edunov and Thomas Scialom},
+ year={2023},
+ eprint={2307.09288},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
+```bibtex
+@article{zeng2022glm,
+ title={Glm-130b: An open bilingual pre-trained model},
+ author={Zeng, Aohan and Liu, Xiao and Du, Zhengxiao and Wang, Zihan and Lai, Hanyu and Ding, Ming and Yang, Zhuoyi and Xu, Yifan and Zheng, Wendi and Xia, Xiao and others},
+ journal={arXiv preprint arXiv:2210.02414},
+ year={2022}
+}
+```
+```bibtex
+@inproceedings{du2022glm,
+ title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling},
+ author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie},
+ booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+ pages={320--335},
+ year={2022}
+}
+```
diff --git a/applications/ColossalQA/colossalqa/__init__.py b/applications/ColossalQA/colossalqa/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/applications/ColossalQA/colossalqa/chain/__init__.py b/applications/ColossalQA/colossalqa/chain/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/applications/ColossalQA/colossalqa/chain/memory/__init__.py b/applications/ColossalQA/colossalqa/chain/memory/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/applications/ColossalQA/colossalqa/chain/memory/summary.py b/applications/ColossalQA/colossalqa/chain/memory/summary.py
new file mode 100644
index 000000000000..1d63bbc4a47e
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/chain/memory/summary.py
@@ -0,0 +1,103 @@
+"""
+Custom SummarizerMixin base class and ConversationSummaryMemory class
+
+Modified from Original Source
+
+This code is based on LangChain Ai's langchain, which can be found at
+https://github.com/langchain-ai/langchain
+The original code is licensed under the MIT license.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Type
+
+from langchain.chains.llm import LLMChain
+from langchain.memory.chat_memory import BaseChatMemory
+from langchain.memory.prompt import SUMMARY_PROMPT
+from langchain.pydantic_v1 import BaseModel, root_validator
+from langchain.schema import BaseChatMessageHistory, BasePromptTemplate
+from langchain.schema.language_model import BaseLanguageModel
+from langchain.schema.messages import BaseMessage, SystemMessage, get_buffer_string
+
+
+class SummarizerMixin(BaseModel):
+ """
+ Mixin for summarizer.
+ """
+
+ human_prefix: str = "Human"
+ ai_prefix: str = "Assistant"
+ llm: BaseLanguageModel
+ prompt: BasePromptTemplate = SUMMARY_PROMPT
+ summary_message_cls: Type[BaseMessage] = SystemMessage
+ llm_kwargs: Dict = {}
+
+ def predict_new_summary(self, messages: List[BaseMessage], existing_summary: str, stop: List = []) -> str:
+ """
+ Recursively summarize a conversation by generating a new summary using
+ the last round of conversation and the existing summary.
+ """
+ new_lines = get_buffer_string(
+ messages,
+ human_prefix=self.human_prefix,
+ ai_prefix=self.ai_prefix,
+ )
+
+ chain = LLMChain(llm=self.llm, prompt=self.prompt, llm_kwargs=self.llm_kwargs)
+ return chain.predict(summary=existing_summary, new_lines=new_lines, stop=stop)
+
+
+class ConversationSummaryMemory(BaseChatMemory, SummarizerMixin):
+ """Conversation summarizer to chat memory."""
+
+ buffer: str = ""
+ memory_key: str = "history"
+
+ @classmethod
+ def from_messages(
+ cls,
+ llm: BaseLanguageModel,
+ chat_memory: BaseChatMessageHistory,
+ summarize_step: int = 2,
+ **kwargs: Any,
+ ) -> ConversationSummaryMemory:
+ obj = cls(llm=llm, chat_memory=chat_memory, **kwargs)
+ for i in range(0, len(obj.chat_memory.messages), summarize_step):
+ obj.buffer = obj.predict_new_summary(obj.chat_memory.messages[i : i + summarize_step], obj.buffer)
+ return obj
+
+ @property
+ def memory_variables(self) -> List[str]:
+ """Will always return list of memory variables."""
+ return [self.memory_key]
+
+ def load_memory_variables(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+ """Return history buffer."""
+ if self.return_messages:
+ buffer: Any = [self.summary_message_cls(content=self.buffer)]
+ else:
+ buffer = self.buffer
+ return {self.memory_key: buffer}
+
+ @root_validator()
+ def validate_prompt_input_variables(cls, values: Dict) -> Dict:
+ """Validate that prompt input variables are consistent."""
+ prompt_variables = values["prompt"].input_variables
+ expected_keys = {"summary", "new_lines"}
+ if expected_keys != set(prompt_variables):
+ raise ValueError(
+ "Got unexpected prompt input variables. The prompt expects "
+ f"{prompt_variables}, but it should have {expected_keys}."
+ )
+ return values
+
+ def save_context(self, inputs: Dict[str, Any], outputs: Dict[str, str]) -> None:
+ """Save context from this conversation to buffer."""
+ super().save_context(inputs, outputs)
+ self.buffer = self.predict_new_summary(self.chat_memory.messages[-2:], self.buffer)
+
+ def clear(self) -> None:
+ """Clear memory contents."""
+ super().clear()
+ self.buffer = ""
diff --git a/applications/ColossalQA/colossalqa/chain/retrieval_qa/__init__.py b/applications/ColossalQA/colossalqa/chain/retrieval_qa/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/applications/ColossalQA/colossalqa/chain/retrieval_qa/base.py b/applications/ColossalQA/colossalqa/chain/retrieval_qa/base.py
new file mode 100644
index 000000000000..e80befdaccfa
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/chain/retrieval_qa/base.py
@@ -0,0 +1,214 @@
+"""
+Chain for question-answering against a vector database.
+
+Modified from Original Source
+
+This code is based on LangChain Ai's langchain, which can be found at
+https://github.com/langchain-ai/langchain
+The original code is licensed under the MIT license.
+"""
+from __future__ import annotations
+
+import copy
+import inspect
+from typing import Any, Dict, List, Optional
+
+from colossalqa.chain.retrieval_qa.load_chain import load_qa_chain
+from colossalqa.chain.retrieval_qa.stuff import CustomStuffDocumentsChain
+from langchain.callbacks.manager import AsyncCallbackManagerForChainRun, CallbackManagerForChainRun, Callbacks
+from langchain.chains.llm import LLMChain
+from langchain.chains.question_answering.stuff_prompt import PROMPT_SELECTOR
+from langchain.chains.retrieval_qa.base import BaseRetrievalQA
+from langchain.prompts import PromptTemplate
+from langchain.pydantic_v1 import Field
+from langchain.schema import BaseRetriever, Document
+from langchain.schema.language_model import BaseLanguageModel
+
+class CustomBaseRetrievalQA(BaseRetrievalQA):
+ """Base class for question-answering chains."""
+
+ @classmethod
+ def from_llm(
+ cls,
+ llm: BaseLanguageModel,
+ prompt: Optional[PromptTemplate] = None,
+ callbacks: Callbacks = None,
+ **kwargs: Any,
+ ) -> BaseRetrievalQA:
+ """Initialize from LLM."""
+ llm_kwargs = kwargs.pop("llm_kwargs", {})
+ _prompt = prompt or PROMPT_SELECTOR.get_prompt(llm)
+ llm_chain = LLMChain(llm=llm, prompt=_prompt, callbacks=callbacks, llm_kwargs=llm_kwargs)
+ document_prompt = kwargs.get(
+ "document_prompt", PromptTemplate(input_variables=["page_content"], template="Context:\n{page_content}")
+ )
+ combine_documents_chain = CustomStuffDocumentsChain(
+ llm_chain=llm_chain,
+ document_variable_name="context",
+ document_prompt=document_prompt,
+ callbacks=callbacks,
+ )
+
+ return cls(
+ combine_documents_chain=combine_documents_chain,
+ callbacks=callbacks,
+ **kwargs,
+ )
+
+ @classmethod
+ def from_chain_type(
+ cls,
+ llm: BaseLanguageModel,
+ chain_type: str = "stuff",
+ chain_type_kwargs: Optional[dict] = None,
+ **kwargs: Any,
+ ) -> BaseRetrievalQA:
+ """Load chain from chain type."""
+ llm_kwargs = kwargs.pop("llm_kwargs", {})
+ _chain_type_kwargs = chain_type_kwargs or {}
+ combine_documents_chain = load_qa_chain(llm, chain_type=chain_type, **_chain_type_kwargs, llm_kwargs=llm_kwargs)
+ return cls(combine_documents_chain=combine_documents_chain, **kwargs)
+
+ def _call(
+ self,
+ inputs: Dict[str, Any],
+ run_manager: Optional[CallbackManagerForChainRun] = None,
+ ) -> Dict[str, Any]:
+ """Run get_relevant_text and llm on input query.
+
+ If chain has 'return_source_documents' as 'True', returns
+ the retrieved documents as well under the key 'source_documents'.
+
+ Example:
+ .. code-block:: python
+
+ res = indexqa({'query': 'This is my query'})
+ answer, docs = res['result'], res['source_documents']
+ """
+ _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
+ question = inputs[self.input_key]
+ accepts_run_manager = "run_manager" in inspect.signature(self._get_docs).parameters
+ if accepts_run_manager:
+ docs = self._get_docs(question, run_manager=_run_manager)
+ else:
+ docs = self._get_docs(question) # type: ignore[call-arg]
+
+ kwargs = {
+ k: v
+ for k, v in inputs.items()
+ if k in ["stop", "temperature", "top_k", "top_p", "max_new_tokens", "doc_prefix"]
+ }
+ answers = []
+ if self.combine_documents_chain.memory is not None:
+ buffered_history_backup, summarized_history_temp_backup = copy.deepcopy(
+ self.combine_documents_chain.memory.buffered_history
+ ), copy.deepcopy(self.combine_documents_chain.memory.summarized_history_temp)
+ else:
+ buffered_history_backup = None
+ summarized_history_temp_backup = None
+
+ answer = self.combine_documents_chain.run(
+ input_documents=docs, question=question, callbacks=_run_manager.get_child(), **kwargs
+ )
+ if summarized_history_temp_backup is not None and buffered_history_backup is not None:
+ (
+ self.combine_documents_chain.memory.buffered_history,
+ self.combine_documents_chain.memory.summarized_history_temp,
+ ) = copy.deepcopy(buffered_history_backup), copy.deepcopy(summarized_history_temp_backup)
+
+ # if rejection_trigger_keywords is not given, return the response from LLM directly
+ rejection_trigger_keywrods = inputs.get('rejection_trigger_keywrods', [])
+ answer = answer if all([rej not in answer for rej in rejection_trigger_keywrods]) else None
+ if answer is None:
+ answer = inputs.get('rejection_answer', "抱歉,根据提供的信息无法回答该问题。")
+ if self.combine_documents_chain.memory is not None:
+ self.combine_documents_chain.memory.save_context({"question": question}, {"output": answer})
+
+ if self.return_source_documents:
+ return {self.output_key: answer, "source_documents": docs}
+ else:
+ return {self.output_key: answer}
+
+ async def _acall(
+ self,
+ inputs: Dict[str, Any],
+ run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
+ ) -> Dict[str, Any]:
+ """Run get_relevant_text and llm on input query.
+
+ If chain has 'return_source_documents' as 'True', returns
+ the retrieved documents as well under the key 'source_documents'.
+
+ Example:
+ .. code-block:: python
+
+ res = indexqa({'query': 'This is my query'})
+ answer, docs = res['result'], res['source_documents']
+ """
+ _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
+ question = inputs[self.input_key]
+ accepts_run_manager = "run_manager" in inspect.signature(self._aget_docs).parameters
+ if accepts_run_manager:
+ docs = await self._aget_docs(question, run_manager=_run_manager)
+ else:
+ docs = await self._aget_docs(question) # type: ignore[call-arg]
+ kwargs = {
+ k: v
+ for k, v in inputs.items()
+ if k in ["stop", "temperature", "top_k", "top_p", "max_new_tokens", "doc_prefix"]
+ }
+ answer = await self.combine_documents_chain.arun(
+ input_documents=docs, question=question, callbacks=_run_manager.get_child(), **kwargs
+ )
+ # if rejection_trigger_keywords is not given, return the response from LLM directly
+ rejection_trigger_keywrods = inputs.get('rejection_trigger_keywrods', [])
+ answer = answer if all([rej not in answer for rej in rejection_trigger_keywrods]) or len(rejection_trigger_keywrods)==0 else None
+ if answer is None:
+ answer = inputs.get('rejection_answer', "抱歉,根据提供的信息无法回答该问题。")
+ self.combine_documents_chain.memory.save_context({"question": question}, {"output": answer})
+
+ if self.return_source_documents:
+ return {self.output_key: answer, "source_documents": docs}
+ else:
+ return {self.output_key: answer}
+
+
+class RetrievalQA(CustomBaseRetrievalQA):
+ """Chain for question-answering against an index.
+
+ Example:
+ .. code-block:: python
+
+ from langchain.llms import OpenAI
+ from langchain.chains import RetrievalQA
+ from langchain.faiss import FAISS
+ from langchain.vectorstores.base import VectorStoreRetriever
+ retriever = VectorStoreRetriever(vectorstore=FAISS(...))
+ retrievalQA = RetrievalQA.from_llm(llm=OpenAI(), retriever=retriever)
+
+ """
+
+ retriever: BaseRetriever = Field(exclude=True)
+
+ def _get_docs(
+ self,
+ question: str,
+ *,
+ run_manager: CallbackManagerForChainRun,
+ ) -> List[Document]:
+ """Get docs."""
+ return self.retriever.get_relevant_documents(question, callbacks=run_manager.get_child())
+
+ async def _aget_docs(
+ self,
+ question: str,
+ *,
+ run_manager: AsyncCallbackManagerForChainRun,
+ ) -> List[Document]:
+ """Get docs."""
+ return await self.retriever.aget_relevant_documents(question, callbacks=run_manager.get_child())
+
+ @property
+ def _chain_type(self) -> str:
+ """Return the chain type."""
+ return "retrieval_qa"
diff --git a/applications/ColossalQA/colossalqa/chain/retrieval_qa/load_chain.py b/applications/ColossalQA/colossalqa/chain/retrieval_qa/load_chain.py
new file mode 100644
index 000000000000..a2b1f81e34b9
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/chain/retrieval_qa/load_chain.py
@@ -0,0 +1,87 @@
+"""
+Load question answering chains.
+For now, only the stuffed chain is modified
+
+Modified from Original Source
+
+This code is based on LangChain Ai's langchain, which can be found at
+https://github.com/langchain-ai/langchain
+The original code is licensed under the MIT license.
+"""
+import copy
+from typing import Any, Mapping, Optional, Protocol
+
+from colossalqa.chain.retrieval_qa.stuff import CustomStuffDocumentsChain
+from langchain.callbacks.base import BaseCallbackManager
+from langchain.callbacks.manager import Callbacks
+from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
+from langchain.chains.llm import LLMChain
+from langchain.chains.question_answering import stuff_prompt
+from langchain.schema.language_model import BaseLanguageModel
+from langchain.schema.prompt_template import BasePromptTemplate
+
+
+class LoadingCallable(Protocol):
+ """Interface for loading the combine documents chain."""
+
+ def __call__(self, llm: BaseLanguageModel, **kwargs: Any) -> BaseCombineDocumentsChain:
+ """Callable to load the combine documents chain."""
+
+
+def _load_stuff_chain(
+ llm: BaseLanguageModel,
+ prompt: Optional[BasePromptTemplate] = None,
+ document_variable_name: str = "context",
+ verbose: Optional[bool] = None,
+ callback_manager: Optional[BaseCallbackManager] = None,
+ callbacks: Callbacks = None,
+ **kwargs: Any,
+) -> CustomStuffDocumentsChain:
+ _prompt = prompt or stuff_prompt.PROMPT_SELECTOR.get_prompt(llm)
+ if "llm_kwargs" in kwargs:
+ llm_kwargs = copy.deepcopy(kwargs["llm_kwargs"])
+ del kwargs["llm_kwargs"]
+ else:
+ llm_kwargs = {}
+ llm_chain = LLMChain(
+ llm=llm,
+ prompt=_prompt,
+ verbose=verbose,
+ callback_manager=callback_manager,
+ callbacks=callbacks,
+ llm_kwargs=llm_kwargs,
+ )
+ return CustomStuffDocumentsChain(
+ llm_chain=llm_chain,
+ document_variable_name=document_variable_name,
+ verbose=verbose,
+ callback_manager=callback_manager,
+ callbacks=callbacks,
+ **kwargs,
+ )
+
+
+def load_qa_chain(
+ llm: BaseLanguageModel,
+ chain_type: str = "stuff",
+ verbose: Optional[bool] = None,
+ callback_manager: Optional[BaseCallbackManager] = None,
+ **kwargs: Any,
+) -> BaseCombineDocumentsChain:
+ """Load question answering chain.
+
+ Args:
+ llm: Language Model to use in the chain.
+ chain_type: Type of document combining chain to use. Should be one of "stuff",
+ "map_reduce", "map_rerank", and "refine".
+ verbose: Whether chains should be run in verbose mode or not. Note that this
+ applies to all chains that make up the final chain.
+ callback_manager: Callback manager to use for the chain.
+
+ Returns:
+ A chain to use for question answering.
+ """
+ loader_mapping: Mapping[str, LoadingCallable] = {"stuff": _load_stuff_chain}
+ if chain_type not in loader_mapping:
+ raise ValueError(f"Got unsupported chain type: {chain_type}. " f"Should be one of {loader_mapping.keys()}")
+ return loader_mapping[chain_type](llm, verbose=verbose, callback_manager=callback_manager, **kwargs)
diff --git a/applications/ColossalQA/colossalqa/chain/retrieval_qa/stuff.py b/applications/ColossalQA/colossalqa/chain/retrieval_qa/stuff.py
new file mode 100644
index 000000000000..bf7ad0ffce28
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/chain/retrieval_qa/stuff.py
@@ -0,0 +1,91 @@
+"""
+Chain that combines documents by stuffing into context
+
+Modified from Original Source
+
+This code is based on LangChain Ai's langchain, which can be found at
+https://github.com/langchain-ai/langchain
+The original code is licensed under the MIT license.
+"""
+import copy
+from typing import Any, List
+
+from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+from langchain.docstore.document import Document
+from langchain.schema import format_document
+
+
+class CustomStuffDocumentsChain(StuffDocumentsChain):
+ """Chain that combines documents by stuffing into context.
+
+ This chain takes a list of documents and first combines them into a single string.
+ It does this by formatting each document into a string with the `document_prompt`
+ and then joining them together with `document_separator`. It then adds that new
+ string to the inputs with the variable name set by `document_variable_name`.
+ Those inputs are then passed to the `llm_chain`.
+
+ Example:
+ .. code-block:: python
+
+ from langchain.chains import StuffDocumentsChain, LLMChain
+ from langchain.prompts import PromptTemplate
+ from langchain.llms import OpenAI
+
+ # This controls how each document will be formatted. Specifically,
+ # it will be passed to `format_document` - see that function for more
+ # details.
+ document_prompt = PromptTemplate(
+ input_variables=["page_content"],
+ template="{page_content}"
+ )
+ document_variable_name = "context"
+ llm = OpenAI()
+ # The prompt here should take as an input variable the
+ # `document_variable_name`
+ prompt = PromptTemplate.from_template(
+ "Summarize this content: {context}"
+ )
+ llm_chain = LLMChain(llm=llm, prompt=prompt)
+ chain = StuffDocumentsChain(
+ llm_chain=llm_chain,
+ document_prompt=document_prompt,
+ document_variable_name=document_variable_name
+ )
+ """
+
+ def _get_inputs(self, docs: List[Document], **kwargs: Any) -> dict:
+ """Construct inputs from kwargs and docs.
+
+ Format and the join all the documents together into one input with name
+ `self.document_variable_name`. The pluck any additional variables
+ from **kwargs.
+
+ Args:
+ docs: List of documents to format and then join into single input
+ **kwargs: additional inputs to chain, will pluck any other required
+ arguments from here.
+
+ Returns:
+ dictionary of inputs to LLMChain
+ """
+ # Format each document according to the prompt
+
+ # if the document is in the key-value format has a 'is_key_value_mapping'=True in meta_data and has 'value' in metadata
+ # use the value to replace the key
+ doc_prefix = kwargs.get("doc_prefix", "Supporting Document")
+ docs_ = []
+ for id, doc in enumerate(docs):
+ doc_ = copy.deepcopy(doc)
+ if doc_.metadata.get("is_key_value_mapping", False) and "value" in doc_.metadata:
+ doc_.page_content = str(doc_.metadata["value"])
+ prefix = doc_prefix + str(id)
+ doc_.page_content = str(prefix + ":" + (" " if doc_.page_content[0] != " " else "") + doc_.page_content)
+ docs_.append(doc_)
+
+ doc_strings = [format_document(doc, self.document_prompt) for doc in docs_]
+ arg_list = ["stop", "temperature", "top_k", "top_p", "max_new_tokens"]
+ arg_list.extend(self.llm_chain.prompt.input_variables)
+ # Join the documents together to put them in the prompt.
+ inputs = {k: v for k, v in kwargs.items() if k in arg_list}
+ inputs[self.document_variable_name] = self.document_separator.join(doc_strings)
+ return inputs
diff --git a/applications/ColossalQA/colossalqa/data_loader/__init__.py b/applications/ColossalQA/colossalqa/data_loader/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/applications/ColossalQA/colossalqa/data_loader/document_loader.py b/applications/ColossalQA/colossalqa/data_loader/document_loader.py
new file mode 100644
index 000000000000..0fe1e4d1a00c
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/data_loader/document_loader.py
@@ -0,0 +1,128 @@
+"""
+Class for loading document type data
+"""
+
+import glob
+from typing import List
+
+from colossalqa.mylogging import get_logger
+from langchain.document_loaders import (
+ JSONLoader,
+ PyPDFLoader,
+ TextLoader,
+ UnstructuredHTMLLoader,
+ UnstructuredMarkdownLoader,
+)
+from langchain.document_loaders.csv_loader import CSVLoader
+
+logger = get_logger()
+
+SUPPORTED_DATA_FORMAT = [".csv", ".json", ".html", ".md", ".pdf", ".txt", ".jsonl"]
+
+
+class DocumentLoader:
+ """
+ Load documents from different files into list of langchain Documents
+ """
+
+ def __init__(self, files: List, **kwargs) -> None:
+ """
+ Args:
+ files: list of files (list[file path, name])
+ **kwargs: keyword type arguments, useful for certain document types
+ """
+ self.data = {}
+ self.kwargs = kwargs
+
+ for item in files:
+ path = item[0] if isinstance(item, list) else item
+ logger.info(f"Loading data from {path}")
+ self.load_data(path)
+ logger.info("Data loaded")
+
+ self.all_data = []
+ for key in self.data:
+ if isinstance(self.data[key], list):
+ for item in self.data[key]:
+ if isinstance(item, list):
+ self.all_data.extend(item)
+ else:
+ self.all_data.append(item)
+
+ def load_data(self, path: str) -> None:
+ """
+ Load data. Please refer to https://python.langchain.com/docs/modules/data_connection/document_loaders/
+ for sepcific format requirements.
+ Args:
+ path: path to a file
+ To load files with glob path, here are some examples.
+ Load all file from directory: folder1/folder2/*
+ Load all pdf file from directory: folder1/folder2/*.pdf
+ """
+ files = []
+
+ # Handle glob expression
+ try:
+ files = glob.glob(path)
+ except Exception as e:
+ logger.error(e)
+ if len(files) == 0:
+ raise ValueError("Unsupported file/directory format. For directories, please use glob expression")
+ elif len(files) == 1:
+ path = files[0]
+ else:
+ for file in files:
+ self.load_data(file)
+ return
+
+ # Load data if the path is a file
+ logger.info(f"load {path}", verbose=True)
+ if path.endswith(".csv"):
+ # Load csv
+ loader = CSVLoader(file_path=path, encoding="utf8")
+ data = loader.load()
+ self.data[path] = data
+ elif path.endswith(".txt"):
+ # Load txt
+ loader = TextLoader(path, encoding="utf8")
+ data = loader.load()
+ self.data[path] = data
+ elif path.endswith("html"):
+ # Load html
+ loader = UnstructuredHTMLLoader(path, encoding="utf8")
+ data = loader.load()
+ self.data[path] = data
+ elif path.endswith("json"):
+ # Load json
+ loader = JSONLoader(
+ file_path=path,
+ jq_schema=self.kwargs.get("jq_schema", ".data[]"),
+ content_key=self.kwargs.get("content_key", "content"),
+ metadata_func=self.kwargs.get("metadata_func", None),
+ )
+
+ data = loader.load()
+ self.data[path] = data
+ elif path.endswith("jsonl"):
+ # Load jsonl
+ loader = JSONLoader(
+ file_path=path, jq_schema=self.kwargs.get("jq_schema", ".data[].content"), json_lines=True
+ )
+ data = loader.load()
+ self.data[path] = data
+ elif path.endswith(".md"):
+ # Load markdown
+ loader = UnstructuredMarkdownLoader(path)
+ data = loader.load()
+ self.data[path] = data
+ elif path.endswith(".pdf"):
+ # Load pdf
+ loader = PyPDFLoader(path)
+ data = loader.load_and_split()
+ self.data[path] = data
+ else:
+ if "." in path.split("/")[-1]:
+ raise ValueError(f"Unsupported file format {path}. Supported formats: {SUPPORTED_DATA_FORMAT}")
+ else:
+ # May ba a directory, we strictly follow the glob path and will not load files in subdirectories
+ pass
diff --git a/applications/ColossalQA/colossalqa/data_loader/table_dataloader.py b/applications/ColossalQA/colossalqa/data_loader/table_dataloader.py
new file mode 100644
index 000000000000..cad48254498e
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/data_loader/table_dataloader.py
@@ -0,0 +1,119 @@
+'''
+Class for loading table type data. please refer to Pandas-Input/Output for file format details.
+'''
+
+
+import os
+import glob
+import pandas as pd
+from sqlalchemy import create_engine
+from colossalqa.utils import drop_table
+from colossalqa.mylogging import get_logger
+
+logger = get_logger()
+
+SUPPORTED_DATA_FORMAT = ['.csv','.xlsx', '.xls','.json','.html','.h5', '.hdf5','.parquet','.feather','.dta']
+
+class TableLoader:
+ '''
+ Load tables from different files and serve a sql database for database operations
+ '''
+ def __init__(self, files: str,
+ sql_path:str='sqlite:///mydatabase.db',
+ verbose=False, **kwargs) -> None:
+ '''
+ Args:
+ files: list of files (list[file path, name])
+ sql_path: how to serve the sql database
+ **kwargs: keyword type arguments, useful for certain document types
+ '''
+ self.data = {}
+ self.verbose = verbose
+ self.sql_path = sql_path
+ self.kwargs = kwargs
+ self.sql_engine = create_engine(self.sql_path)
+ drop_table(self.sql_engine)
+
+ self.sql_engine = create_engine(self.sql_path)
+ for item in files:
+ path = item[0]
+ dataset_name = item[1]
+ if not os.path.exists(path):
+ raise FileNotFoundError(f"{path} doesn't exists")
+ if not any([path.endswith(i) for i in SUPPORTED_DATA_FORMAT]):
+ raise TypeError(f"{path} not supported. Supported type {SUPPORTED_DATA_FORMAT}")
+
+ logger.info("loading data", verbose=self.verbose)
+ self.load_data(path)
+ logger.info("data loaded", verbose=self.verbose)
+ self.to_sql(path, dataset_name)
+
+ def load_data(self, path):
+ '''
+ Load data and serve the data as sql database.
+ Data must be in pandas format
+ '''
+ files = []
+ # Handle glob expression
+ try:
+ files = glob.glob(path)
+ except Exception as e:
+ logger.error(e)
+ if len(files)==0:
+ raise ValueError("Unsupported file/directory format. For directories, please use glob expression")
+ elif len(files)==1:
+ path = files[0]
+ else:
+ for file in files:
+ self.load_data(file)
+
+ if path.endswith('.csv'):
+ # Load csv
+ self.data[path] = pd.read_csv(path)
+ elif path.endswith('.xlsx') or path.endswith('.xls'):
+ # Load excel
+ self.data[path] = pd.read_excel(path) # You can adjust the sheet_name as needed
+ elif path.endswith('.json'):
+ # Load json
+ self.data[path] = pd.read_json(path)
+ elif path.endswith('.html'):
+ # Load html
+ html_tables = pd.read_html(path)
+ # Choose the desired table from the list of DataFrame objects
+ self.data[path] = html_tables[0] # You may need to adjust this index
+ elif path.endswith('.h5') or path.endswith('.hdf5'):
+ # Load h5
+ self.data[path] = pd.read_hdf(path, key=self.kwargs.get('key', 'data')) # You can adjust the key as needed
+ elif path.endswith('.parquet'):
+ # Load parquet
+ self.data[path] = pd.read_parquet(path, engine='fastparquet')
+ elif path.endswith('.feather'):
+ # Load feather
+ self.data[path] = pd.read_feather(path)
+ elif path.endswith('.dta'):
+ # Load dta
+ self.data[path] = pd.read_stata(path)
+ else:
+ raise ValueError("Unsupported file format")
+
+ def to_sql(self, path, table_name):
+ '''
+ Serve the data as sql database.
+ '''
+ self.data[path].to_sql(table_name, con=self.sql_engine, if_exists='replace', index=False)
+ logger.info(f"Loaded to Sqlite3\nPath: {path}", verbose=self.verbose)
+ return self.sql_path
+
+ def get_sql_path(self):
+ return self.sql_path
+
+ def __del__(self):
+ if self.sql_engine:
+ drop_table(self.sql_engine)
+ self.sql_engine.dispose()
+ del self.data
+ del self.sql_engine
+
+
+
+
diff --git a/applications/ColossalQA/colossalqa/local/__init__.py b/applications/ColossalQA/colossalqa/local/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/applications/ColossalQA/colossalqa/local/colossalcloud_llm.py b/applications/ColossalQA/colossalqa/local/colossalcloud_llm.py
new file mode 100644
index 000000000000..62aead66c54b
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/local/colossalcloud_llm.py
@@ -0,0 +1,125 @@
+"""
+LLM wrapper for LLMs running on ColossalCloud Platform
+
+Usage:
+
+os.environ['URL'] = ""
+os.environ['HOST'] = ""
+
+gen_config = {
+ 'max_new_tokens': 100,
+ # 'top_k': 2,
+ 'top_p': 0.9,
+ 'temperature': 0.5,
+ 'repetition_penalty': 2,
+ }
+
+llm = ColossalCloudLLM(n=1)
+llm.set_auth_config()
+resp = llm(prompt='What do you call a three-ton kangaroo?', **gen_config)
+print(resp) # super-heavyweight awesome-natured yawning Australian creature!
+
+"""
+import json
+from typing import Any, List, Mapping, Optional
+
+import requests
+from langchain.llms.base import LLM
+from langchain.utils import get_from_dict_or_env
+
+
+class ColossalCloudLLM(LLM):
+ """
+ A custom LLM class that integrates LLMs running on the ColossalCloud Platform
+
+ """
+ n: int
+ gen_config: dict = None
+ auth_config: dict = None
+ valid_gen_para: list = ['max_new_tokens', 'top_k',
+ 'top_p', 'temperature', 'repetition_penalty']
+
+ def __init__(self, gen_config=None, **kwargs):
+ """
+ Args:
+ gen_config: config for generation,
+ max_new_tokens: 50 by default
+ top_k: (1, vocab_size)
+ top_p: (0, 1) if not None
+ temperature: (0, inf) if not None
+ repetition_penalty: (1, inf) if not None
+ """
+ super(ColossalCloudLLM, self).__init__(**kwargs)
+ if gen_config is None:
+ self.gen_config = {"max_new_tokens": 50}
+ else:
+ assert "max_new_tokens" in gen_config, "max_new_tokens is a compulsory key in the gen config"
+ self.gen_config = gen_config
+
+ @property
+ def _identifying_params(self) -> Mapping[str, Any]:
+ """Get the identifying parameters."""
+ return {"n": self.n}
+
+ @property
+ def _llm_type(self) -> str:
+ return 'ColossalCloudLLM'
+
+ def set_auth_config(self, **kwargs):
+ url = get_from_dict_or_env(kwargs, "url", "URL")
+ host = get_from_dict_or_env(kwargs, "host", "HOST")
+
+ auth_config = {}
+ auth_config['endpoint'] = url
+ auth_config['Host'] = host
+ self.auth_config = auth_config
+
+ def _call(self, prompt: str, stop=None, **kwargs: Any) -> str:
+ """
+ Args:
+ prompt: The prompt to pass into the model.
+ stop: A list of strings to stop generation when encountered
+
+ Returns:
+ The string generated by the model
+ """
+ # Update the generation arguments
+ for key, value in kwargs.items():
+ if key not in self.valid_gen_para:
+ raise KeyError(f"Invalid generation parameter: '{key}'. Valid keys are: {', '.join(self.valid_gen_para)}")
+ if key in self.gen_config:
+ self.gen_config[key] = value
+
+ resp_text = self.text_completion(prompt, self.gen_config, self.auth_config)
+ # TODO: This may cause excessive tokens count
+ if stop is not None:
+ for stopping_words in stop:
+ if stopping_words in resp_text:
+ resp_text = resp_text.split(stopping_words)[0]
+ return resp_text
+
+
+ def text_completion(self, prompt, gen_config, auth_config):
+ # Complusory Parameters
+ endpoint = auth_config.pop('endpoint')
+ max_new_tokens = gen_config.pop('max_new_tokens')
+ # Optional Parameters
+ optional_params = ['top_k', 'top_p', 'temperature', 'repetition_penalty'] # Self.optional
+ gen_config = {key: gen_config[key] for key in optional_params if key in gen_config}
+ # Define the data payload
+ data = {
+ "max_new_tokens": max_new_tokens,
+ "history": [
+ {"instruction": prompt, "response": ""}
+ ],
+ **gen_config
+ }
+ headers = {
+ "Content-Type": "application/json",
+ **auth_config # 'Host',
+ }
+ # Make the POST request
+ response = requests.post(endpoint, headers=headers, data=json.dumps(data))
+ response.raise_for_status() # raise error if return code is not 200(success)
+ # Check the response
+ return response.text
diff --git a/applications/ColossalQA/colossalqa/local/llm.py b/applications/ColossalQA/colossalqa/local/llm.py
new file mode 100644
index 000000000000..ff7346adcf61
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/local/llm.py
@@ -0,0 +1,183 @@
+"""
+API and LLM warpper class for running LLMs locally
+
+Usage:
+
+import os
+model_path = os.environ.get("ZH_MODEL_PATH")
+model_name = "chatglm2"
+colossal_api = ColossalAPI(model_name, model_path)
+llm = ColossalLLM(n=1, api=colossal_api)
+TEST_PROMPT_CHATGLM="续写文章:惊蛰一过,春寒加剧。先是料料峭峭,继而雨季开始,"
+logger.info(llm(TEST_PROMPT_CHATGLM, max_new_tokens=100), verbose=True)
+
+"""
+from typing import Any, List, Mapping, Optional
+
+import torch
+from colossalqa.local.utils import get_response, post_http_request
+from colossalqa.mylogging import get_logger
+from langchain.callbacks.manager import CallbackManagerForLLMRun
+from langchain.llms.base import LLM
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+logger = get_logger()
+
+
+class ColossalAPI:
+ """
+ API for calling LLM.generate
+ """
+
+ __instances = dict()
+
+ def __init__(self, model_type: str, model_path: str, ckpt_path: str = None) -> None:
+ """
+ Configurate model
+ """
+ if model_type + model_path + (ckpt_path or "") in ColossalAPI.__instances:
+ return
+ else:
+ ColossalAPI.__instances[model_type + model_path + (ckpt_path or "")] = self
+ self.model_type = model_type
+ self.model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True)
+
+ if ckpt_path is not None:
+ state_dict = torch.load(ckpt_path)
+ self.model.load_state_dict(state_dict)
+ self.model.to(torch.cuda.current_device())
+
+ # Configurate tokenizer
+ self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+ self.model.eval()
+
+ @staticmethod
+ def get_api(model_type: str, model_path: str, ckpt_path: str = None):
+ if model_type + model_path + (ckpt_path or "") in ColossalAPI.__instances:
+ return ColossalAPI.__instances[model_type + model_path + (ckpt_path or "")]
+ else:
+ return ColossalAPI(model_type, model_path, ckpt_path)
+
+ def generate(self, input: str, **kwargs) -> str:
+ """
+ Generate response given the prompt
+ Args:
+ input: input string
+ **kwargs: language model keyword type arguments, such as top_k, top_p, temperature, max_new_tokens...
+ Returns:
+ output: output string
+ """
+ if self.model_type in ["chatglm", "chatglm2"]:
+ inputs = {
+ k: v.to(torch.cuda.current_device()) for k, v in self.tokenizer(input, return_tensors="pt").items()
+ }
+ else:
+ inputs = {
+ "input_ids": self.tokenizer(input, return_tensors="pt")["input_ids"].to(torch.cuda.current_device())
+ }
+
+ output = self.model.generate(**inputs, **kwargs)
+ output = output.cpu()
+ prompt_len = inputs["input_ids"].size(1)
+ response = output[0, prompt_len:]
+ output = self.tokenizer.decode(response, skip_special_tokens=True)
+ return output
+
+
+class VllmAPI:
+ def __init__(self, host: str = "localhost", port: int = 8077) -> None:
+ # Configurate api for model served through web
+ self.host = host
+ self.port = port
+ self.url = f"http://{self.host}:{self.port}/generate"
+
+ def generate(self, input: str, **kwargs):
+ output = get_response(post_http_request(input, self.url, **kwargs))[0]
+ return output[len(input) :]
+
+
+class ColossalLLM(LLM):
+ """
+ Langchain LLM wrapper for a local LLM
+ """
+
+ n: int
+ api: Any
+ kwargs = {"max_new_tokens": 100}
+
+ @property
+ def _llm_type(self) -> str:
+ return "custom"
+
+ def _call(
+ self,
+ prompt: str,
+ stop: Optional[List[str]] = None,
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
+ **kwargs: Any,
+ ) -> str:
+ logger.info(f"kwargs:{kwargs}\nstop:{stop}\nprompt:{prompt}", verbose=self.verbose)
+ for k in self.kwargs:
+ if k not in kwargs:
+ kwargs[k] = self.kwargs[k]
+
+ generate_args = {k: kwargs[k] for k in kwargs if k not in ["stop", "n"]}
+ out = self.api.generate(prompt, **generate_args)
+ if isinstance(stop, list) and len(stop) != 0:
+ for stopping_words in stop:
+ if stopping_words in out:
+ out = out.split(stopping_words)[0]
+ logger.info(f"{prompt}{out}", verbose=self.verbose)
+ return out
+
+ @property
+ def _identifying_params(self) -> Mapping[str, int]:
+ """Get the identifying parameters."""
+ return {"n": self.n}
+
+
+class VllmLLM(LLM):
+ """
+ Langchain LLM wrapper for a local LLM
+ """
+
+ n: int
+ api: Any
+ kwargs = {"max_new_tokens": 100}
+
+ @property
+ def _llm_type(self) -> str:
+ return "custom"
+
+ def _call(
+ self,
+ prompt: str,
+ stop: Optional[List[str]] = None,
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
+ **kwargs: Any,
+ ) -> str:
+ for k in self.kwargs:
+ if k not in kwargs:
+ kwargs[k] = self.kwargs[k]
+ logger.info(f"kwargs:{kwargs}\nstop:{stop}\nprompt:{prompt}", verbose=self.verbose)
+ generate_args = {k: kwargs[k] for k in kwargs if k in ["n", "max_tokens", "temperature", "stream"]}
+ out = self.api.generate(prompt, **generate_args)
+ if len(stop) != 0:
+ for stopping_words in stop:
+ if stopping_words in out:
+ out = out.split(stopping_words)[0]
+ logger.info(f"{prompt}{out}", verbose=self.verbose)
+ return out
+
+ def set_host_port(self, host: str = "localhost", port: int = 8077, **kwargs) -> None:
+ if "max_tokens" not in kwargs:
+ kwargs["max_tokens"] = 100
+ self.kwargs = kwargs
+ self.api = VllmAPI(host=host, port=port)
+
+ @property
+ def _identifying_params(self) -> Mapping[str, int]:
+ """Get the identifying parameters."""
+ return {"n": self.n}
+
diff --git a/applications/ColossalQA/colossalqa/local/pangu_llm.py b/applications/ColossalQA/colossalqa/local/pangu_llm.py
new file mode 100644
index 000000000000..b8befa92b96f
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/local/pangu_llm.py
@@ -0,0 +1,150 @@
+"""
+LLM wrapper for Pangu
+
+Usage:
+
+# URL: “盘古大模型套件管理”->点击“服务管理”->“模型列表”->点击想要使用的模型的“复制路径”
+# USERNAME: 华为云控制台:“我的凭证”->“API凭证”下的“IAM用户名”,也就是你登录IAM账户的名字
+# PASSWORD: IAM用户的密码
+# DOMAIN_NAME: 华为云控制台:“我的凭证”->“API凭证”下的“用户名”,也就是公司管理IAM账户的总账户名
+
+os.environ["URL"] = ""
+os.environ["URLNAME"] = ""
+os.environ["PASSWORD"] = ""
+os.environ["DOMAIN_NAME"] = ""
+
+pg = Pangu(id=1)
+pg.set_auth_config()
+
+res = pg('你是谁') # 您好,我是华为盘古大模型。我能够通过和您对话互动为您提供帮助。请问您有什么想问我的吗?
+"""
+
+import http.client
+import json
+from typing import Any, List, Mapping, Optional
+
+import requests
+from langchain.llms.base import LLM
+from langchain.utils import get_from_dict_or_env
+
+
+class Pangu(LLM):
+ """
+ A custom LLM class that integrates pangu models
+
+ """
+
+ n: int
+ gen_config: dict = None
+ auth_config: dict = None
+
+ def __init__(self, gen_config=None, **kwargs):
+ super(Pangu, self).__init__(**kwargs)
+ if gen_config is None:
+ self.gen_config = {"user": "User", "max_tokens": 50, "temperature": 0.95, "n": 1}
+ else:
+ self.gen_config = gen_config
+
+ @property
+ def _identifying_params(self) -> Mapping[str, Any]:
+ """Get the identifying parameters."""
+ return {"n": self.n}
+
+ @property
+ def _llm_type(self) -> str:
+ return "pangu"
+
+ def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs) -> str:
+ """
+ Args:
+ prompt: The prompt to pass into the model.
+ stop: A list of strings to stop generation when encountered
+
+ Returns:
+ The string generated by the model
+ """
+ # Update the generation arguments
+ for key, value in kwargs.items():
+ if key in self.gen_config:
+ self.gen_config[key] = value
+
+ response = self.text_completion(prompt, self.gen_config, self.auth_config)
+ text = response["choices"][0]["text"]
+ if stop is not None:
+ for stopping_words in stop:
+ if stopping_words in text:
+ text = text.split(stopping_words)[0]
+ return text
+
+ def set_auth_config(self, **kwargs):
+ url = get_from_dict_or_env(kwargs, "url", "URL")
+ username = get_from_dict_or_env(kwargs, "username", "USERNAME")
+ password = get_from_dict_or_env(kwargs, "password", "PASSWORD")
+ domain_name = get_from_dict_or_env(kwargs, "domain_name", "DOMAIN_NAME")
+
+ region = url.split(".")[1]
+ auth_config = {}
+ auth_config["endpoint"] = url[url.find("https://") + 8 : url.find(".com") + 4]
+ auth_config["resource_path"] = url[url.find(".com") + 4 :]
+ auth_config["auth_token"] = self.get_latest_auth_token(region, username, password, domain_name)
+ self.auth_config = auth_config
+
+ def get_latest_auth_token(self, region, username, password, domain_name):
+ url = f"https://iam.{region}.myhuaweicloud.com/v3/auth/tokens"
+ payload = json.dumps(
+ {
+ "auth": {
+ "identity": {
+ "methods": ["password"],
+ "password": {"user": {"name": username, "password": password, "domain": {"name": domain_name}}},
+ },
+ "scope": {"project": {"name": region}},
+ }
+ }
+ )
+ headers = {"Content-Type": "application/json"}
+
+ response = requests.request("POST", url, headers=headers, data=payload)
+ return response.headers["X-Subject-Token"]
+
+ def text_completion(self, text, gen_config, auth_config):
+ conn = http.client.HTTPSConnection(auth_config["endpoint"])
+ payload = json.dumps(
+ {
+ "prompt": text,
+ "user": gen_config["user"],
+ "max_tokens": gen_config["max_tokens"],
+ "temperature": gen_config["temperature"],
+ "n": gen_config["n"],
+ }
+ )
+ headers = {
+ "X-Auth-Token": auth_config["auth_token"],
+ "Content-Type": "application/json",
+ }
+ conn.request("POST", auth_config["resource_path"], payload, headers)
+ res = conn.getresponse()
+ data = res.read()
+ data = json.loads(data.decode("utf-8"))
+ return data
+
+ def chat_model(self, messages, gen_config, auth_config):
+ conn = http.client.HTTPSConnection(auth_config["endpoint"])
+ payload = json.dumps(
+ {
+ "messages": messages,
+ "user": gen_config["user"],
+ "max_tokens": gen_config["max_tokens"],
+ "temperature": gen_config["temperature"],
+ "n": gen_config["n"],
+ }
+ )
+ headers = {
+ "X-Auth-Token": auth_config["auth_token"],
+ "Content-Type": "application/json",
+ }
+ conn.request("POST", auth_config["resource_path"], payload, headers)
+ res = conn.getresponse()
+ data = res.read()
+ data = json.loads(data.decode("utf-8"))
+ return data
diff --git a/applications/ColossalQA/colossalqa/local/utils.py b/applications/ColossalQA/colossalqa/local/utils.py
new file mode 100644
index 000000000000..ed90264cad8d
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/local/utils.py
@@ -0,0 +1,29 @@
+"""
+Generation utilities
+"""
+import json
+from typing import List
+
+import requests
+
+
+def post_http_request(
+ prompt: str, api_url: str, n: int = 1, max_tokens: int = 100, temperature: float = 0.0, stream: bool = False
+) -> requests.Response:
+ headers = {"User-Agent": "Test Client"}
+ pload = {
+ "prompt": prompt,
+ "n": 1,
+ "use_beam_search": False,
+ "temperature": temperature,
+ "max_tokens": max_tokens,
+ "stream": stream,
+ }
+ response = requests.post(api_url, headers=headers, json=pload, stream=True, timeout=3)
+ return response
+
+
+def get_response(response: requests.Response) -> List[str]:
+ data = json.loads(response.content)
+ output = data["text"]
+ return output
diff --git a/applications/ColossalQA/colossalqa/memory.py b/applications/ColossalQA/colossalqa/memory.py
new file mode 100644
index 000000000000..255df68a367e
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/memory.py
@@ -0,0 +1,168 @@
+"""
+Implement a memory class for storing conversation history
+Support long term and short term memory
+"""
+from typing import Any, Dict, List
+
+from colossalqa.chain.memory.summary import ConversationSummaryMemory
+from colossalqa.chain.retrieval_qa.load_chain import load_qa_chain
+from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
+from langchain.memory.chat_message_histories.in_memory import ChatMessageHistory
+from langchain.schema import BaseChatMessageHistory
+from langchain.schema.messages import BaseMessage
+from langchain.schema.retriever import BaseRetriever
+from pydantic import Field
+
+
+class ConversationBufferWithSummary(ConversationSummaryMemory):
+ """Memory class for storing information about entities."""
+
+ # Define dictionary to store information about entities.
+ # Store the most recent conversation history
+ buffered_history: BaseChatMessageHistory = Field(default_factory=ChatMessageHistory)
+ # Temp buffer
+ summarized_history_temp: BaseChatMessageHistory = Field(default_factory=ChatMessageHistory)
+ human_prefix: str = "Human"
+ ai_prefix: str = "Assistant"
+ buffer: str = "" # Formated conversation in str
+ existing_summary: str = "" # Summarization of stale converstion in str
+ # Define key to pass information about entities into prompt.
+ memory_key: str = "chat_history"
+ input_key: str = "question"
+ retriever: BaseRetriever = None
+ max_tokens: int = 2000
+ chain: BaseCombineDocumentsChain = None
+ input_chain_type_kwargs: List = {}
+
+ @property
+ def buffer(self) -> Any:
+ """String buffer of memory."""
+ return self.buffer_as_messages if self.return_messages else self.buffer_as_str
+
+ @property
+ def buffer_as_str(self) -> str:
+ """Exposes the buffer as a string in case return_messages is True."""
+ self.buffer = self.format_dialogue()
+ return self.buffer
+
+ @property
+ def buffer_as_messages(self) -> List[BaseMessage]:
+ """Exposes the buffer as a list of messages in case return_messages is False."""
+ return self.buffered_history.messages
+
+ def clear(self):
+ """Clear all the memory"""
+ self.buffered_history.clear()
+ self.summarized_history_temp.clear()
+
+ def initiate_document_retrieval_chain(
+ self, llm: Any, prompt_template: Any, retriever: Any, chain_type_kwargs: Dict[str, Any] = {}
+ ) -> None:
+ """
+ Since we need to calculate the length of the prompt, we need to initiate a retrieval chain
+ to calculate the length of the prompt.
+ Args:
+ llm: the language model for the retrieval chain (we won't actually return the output)
+ prompt_template: the prompt template for constructing the retrieval chain
+ retriever: the retriever for the retrieval chain
+ max_tokens: the max length of the prompt (not include the output)
+ chain_type_kwargs: the kwargs for the retrieval chain
+ memory_key: the key for the chat history
+ input_key: the key for the input query
+ """
+ self.retriever = retriever
+ input_chain_type_kwargs = {k: v for k, v in chain_type_kwargs.items() if k not in [self.memory_key]}
+ self.input_chain_type_kwargs = input_chain_type_kwargs
+ self.chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt_template, **self.input_chain_type_kwargs)
+
+ @property
+ def memory_variables(self) -> List[str]:
+ """Define the variables we are providing to the prompt."""
+ return [self.memory_key]
+
+ def format_dialogue(self, lang: str = "en") -> str:
+ """Format memory into two parts--- summarization of historical conversation and most recent conversation"""
+ if len(self.summarized_history_temp.messages) != 0:
+ for i in range(int(len(self.summarized_history_temp.messages) / 2)):
+ self.existing_summary = (
+ self.predict_new_summary(
+ self.summarized_history_temp.messages[i * 2 : i * 2 + 2], self.existing_summary, stop=["\n\n"]
+ )
+ .strip()
+ .split("\n")[0]
+ .strip()
+ )
+ for i in range(int(len(self.summarized_history_temp.messages) / 2)):
+ self.summarized_history_temp.messages.pop(0)
+ self.summarized_history_temp.messages.pop(0)
+ conversation_buffer = []
+ for t in self.buffered_history.messages:
+ if t.type == "human":
+ prefix = self.human_prefix
+ else:
+ prefix = self.ai_prefix
+ conversation_buffer.append(prefix + ": " + t.content)
+ conversation_buffer = "\n".join(conversation_buffer)
+ if len(self.existing_summary) > 0:
+ if lang == "en":
+ message = f"A summarization of historical conversation:\n{self.existing_summary}\nMost recent conversation:\n{conversation_buffer}"
+ elif lang == "zh":
+ message = f"历史对话概要:\n{self.existing_summary}\n最近的对话:\n{conversation_buffer}"
+ else:
+ raise ValueError("Unsupported language")
+ return message
+ else:
+ message = conversation_buffer
+ return message
+
+ def get_conversation_length(self):
+ """Get the length of the formatted conversation"""
+ prompt = self.format_dialogue()
+ length = self.llm.get_num_tokens(prompt)
+ return length
+
+ def load_memory_variables(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+ """Load the memory variables.
+ Summarize oversize conversation to fit into the length constraint defined by max_tokene
+ Args:
+ inputs: the kwargs of the chain of your definition
+ Returns:
+ a dict that maps from memory key to the formated dialogue
+ the formated dialogue has the following format
+ if conversation is too long:
+ A summarization of historical conversation:
+ {summarization}
+ Most recent conversation:
+ Human: XXX
+ Assistant: XXX
+ ...
+ otherwise
+ Human: XXX
+ Assistant: XXX
+ ...
+ """
+ # Calculate remain length
+ if "input_documents" in inputs:
+ # Run in a retrieval qa chain
+ docs = inputs["input_documents"]
+ else:
+ # For test
+ docs = self.retriever.get_relevant_documents(inputs[self.input_key])
+ inputs[self.memory_key] = ""
+ inputs = {k: v for k, v in inputs.items() if k in [self.chain.input_key, self.input_key, self.memory_key]}
+ prompt_length = self.chain.prompt_length(docs, **inputs)
+ remain = self.max_tokens - prompt_length
+ while self.get_conversation_length() > remain:
+ if len(self.buffered_history.messages) <= 2:
+ raise RuntimeError("Exeeed max_tokens, trunck size of retrieved documents is too large")
+ temp = self.buffered_history.messages.pop(0)
+ self.summarized_history_temp.messages.append(temp)
+ temp = self.buffered_history.messages.pop(0)
+ self.summarized_history_temp.messages.append(temp)
+ return {self.memory_key: self.format_dialogue()}
+
+ def save_context(self, inputs: Dict[str, Any], outputs: Dict[str, str]) -> None:
+ """Save context from this conversation to buffer."""
+ input_str, output_str = self._get_input_output(inputs, outputs)
+ self.buffered_history.add_user_message(input_str.strip())
+ self.buffered_history.add_ai_message(output_str.strip())
diff --git a/applications/ColossalQA/colossalqa/mylogging.py b/applications/ColossalQA/colossalqa/mylogging.py
new file mode 100644
index 000000000000..574c33b41685
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/mylogging.py
@@ -0,0 +1,92 @@
+"""
+Class for logging with extra control for debugging
+"""
+import logging
+
+
+class ColossalQALogger:
+ """This is a distributed event logger class essentially based on :class:`logging`.
+
+ Args:
+ name (str): The name of the logger.
+
+ Note:
+ Logging types: ``info``, ``warning``, ``debug`` and ``error``
+ """
+
+ __instances = dict()
+
+ def __init__(self, name):
+ if name in ColossalQALogger.__instances:
+ raise ValueError("Logger with the same name has been created")
+ else:
+ self._name = name
+ self._logger = logging.getLogger(name)
+
+ ColossalQALogger.__instances[name] = self
+
+ @staticmethod
+ def get_instance(name: str):
+ """Get the unique single logger instance based on name.
+
+ Args:
+ name (str): The name of the logger.
+
+ Returns:
+ DistributedLogger: A DistributedLogger object
+ """
+ if name in ColossalQALogger.__instances:
+ return ColossalQALogger.__instances[name]
+ else:
+ logger = ColossalQALogger(name=name)
+ return logger
+
+ def info(self, message: str, verbose: bool = False) -> None:
+ """Log an info message.
+
+ Args:
+ message (str): The message to be logged.
+ verbose (bool): Whether to print the message to stdout.
+ """
+ if verbose:
+ logging.basicConfig(level=logging.INFO)
+ self._logger.info(message)
+
+ def warning(self, message: str, verbose: bool = False) -> None:
+ """Log a warning message.
+
+ Args:
+ message (str): The message to be logged.
+ verbose (bool): Whether to print the message to stdout.
+ """
+ if verbose:
+ self._logger.warning(message)
+
+ def debug(self, message: str, verbose: bool = False) -> None:
+ """Log a debug message.
+
+ Args:
+ message (str): The message to be logged.
+ verbose (bool): Whether to print the message to stdout.
+ """
+ if verbose:
+ self._logger.debug(message)
+
+ def error(self, message: str) -> None:
+ """Log an error message.
+
+ Args:
+ message (str): The message to be logged.
+ """
+ self._logger.error(message)
+
+
+def get_logger(name: str = None, level=logging.INFO) -> ColossalQALogger:
+ """
+ Get the logger by name, if name is None, return the default logger
+ """
+ if name:
+ logger = ColossalQALogger.get_instance(name=name)
+ else:
+ logger = ColossalQALogger.get_instance(name="colossalqa")
+ return logger
diff --git a/applications/ColossalQA/colossalqa/prompt/README.md b/applications/ColossalQA/colossalqa/prompt/README.md
new file mode 100644
index 000000000000..e5c74906b113
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/prompt/README.md
@@ -0,0 +1,144 @@
+# Prompt Design Guide
+
+For the retriever conversation system, users can customize three prompts.
+
+## The Retrieval QA Prompt
+This is the prompt for retrieval QA, the input is user's inputs, the retrieved documents, the historical conversation.
+
+### Chinese
+```
+你是一个善于解答用户问题的AI助手。在保证安全的前提下,回答问题要尽可能有帮助。你的答案不应该包含任何有害的、不道德的、种族主义的、性别歧视的、危险的或非法的内容。请确保你的回答是公正和积极的。
+如果不能根据给定的上下文推断出答案,请不要分享虚假、不确定的信息。
+使用提供的背景信息和聊天记录对用户的输入作出回应或继续对话。您应该只生成一个回复。不需要跟进回答。请使用中文作答。
+
+背景信息:
+[retrieved documents]
+
+聊天记录:
+[historical conversation, overlength chat history will be summarized]
+
+用户: [question]
+Assistant:
+```
+
+### English
+```
+[INST] <>Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+If the answer cannot be infered based on the given context, please don't share false information.<>
+Use the context and chat history to respond to the human's input at the end or carry on the conversation. You should generate one response only. No following up is needed.
+
+context:
+[retrieved documents]
+
+chat history
+[historical conversation, overlength chat history will be summarized]
+
+Human: {question}
+Assistant:
+```
+
+## Summarization Prompt
+This prompt is used by the memory module to recursively summarize overlength conversation to shrink the length of the prompt.
+
+## Disambiguity Prompt
+This prompt is used to perform zero-shot reference resolution to disambiguate entity references within user's questions.
+
+## Final Prompt Examples
+Assume k=3 for the retriever.
+
+### English
+Note that the "[INST] <>...<>" template is the specific prompt format used in LLaMA2.
+#### Normal Length
+```
+[INST] <>Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+If the answer cannot be infered based on the given context, please don't share false information.<>
+Use the context and chat history to respond to the human's input at the end or carry on the conversation. You should generate one response only. No following up is needed.
+
+context:
+[document 1]
+
+[document 2]
+
+[document 3]
+
+chat history
+Human: XXX
+Assistant: XXX
+...
+
+Human: {question}
+Assistant:
+```
+
+#### Overlength
+```
+[INST] <>Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+If the answer cannot be infered based on the given context, please don't share false information.<>
+Use the context and chat history to respond to the human's input at the end or carry on the conversation. You should generate one response only. No following up is needed.
+
+context:
+[document 1]
+
+[document 2]
+
+[document 3]
+
+chat history
+A summarization of historical conversation:
+[one line summary of historical conversation]
+Most recent conversation:
+Human: XXX
+Assistant: XXX
+...
+
+Human: {question}
+Assistant:
+```
+
+### Chinese
+#### Normal Length
+```
+你是一个善于解答用户问题的AI助手。在保证安全的前提下,回答问题要尽可能有帮助。你的答案不应该包含任何有害的、不道德的、种族主义的、性别歧视的、危险的或非法的内容。请确保你的回答是公正和积极的。
+如果不能根据给定的上下文推断出答案,请不要分享虚假、不确定的信息。
+使用提供的背景信息和聊天记录对用户的输入作出回应或继续对话。您应该只生成一个回复。不需要跟进回答。请使用中文作答。
+
+背景信息:
+[document 1]
+
+[document 2]
+
+[document 3]
+
+聊天记录:
+用户: XXX
+Assistant: XXX
+...
+
+用户: [question]
+Assistant:
+```
+
+#### Overlength
+```
+你是一个善于解答用户问题的AI助手。在保证安全的前提下,回答问题要尽可能有帮助。你的答案不应该包含任何有害的、不道德的、种族主义的、性别歧视的、危险的或非法的内容。请确保你的回答是公正和积极的。
+如果不能根据给定的上下文推断出答案,请不要分享虚假、不确定的信息。
+使用提供的背景信息和聊天记录对用户的输入作出回应或继续对话。您应该只生成一个回复。不需要跟进回答。请使用中文作答。
+
+背景信息:
+[document 1]
+
+[document 2]
+
+[document 3]
+
+聊天记录:
+历史对话概要:
+[one line summary of historical conversation]
+最近的对话:
+用户: XXX
+Assistant: XXX
+...
+
+用户: [question]
+Assistant:
+```
diff --git a/applications/ColossalQA/colossalqa/prompt/prompt.py b/applications/ColossalQA/colossalqa/prompt/prompt.py
new file mode 100644
index 000000000000..a7723078689e
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/prompt/prompt.py
@@ -0,0 +1,124 @@
+"""
+All custom prompt templates are defined here.
+"""
+
+from langchain.prompts.prompt import PromptTemplate
+
+_CUSTOM_SUMMARIZER_TEMPLATE_ZH = """请递进式地总结所提供的当前对话,将当前对话的摘要内容添加到先前已有的摘要上,返回一个融合了当前对话的新的摘要。
+
+例1:
+已有的摘要:
+人类问Assistant对人工智能的看法。人工智能认为人工智能是一种善的力量。
+
+新的对话内容:
+人类: 为什么你认为人工智能是一种好的力量?
+Assistant: 因为人工智能将帮助人类充分发挥潜力。
+
+新的摘要:
+人类问Assistant对人工智能的看法。人工智能认为人工智能是一种积极的力量,因为它将帮助人类充分发挥潜力。
+示例结束
+
+已有的摘要:
+{summary}
+
+新的对话内容:
+{new_lines}
+
+新的摘要:"""
+
+
+# Chinese retrieval qa prompt
+
+_ZH_RETRIEVAL_QA_PROMPT = """<指令>根据下列支持文档和对话历史,简洁和专业地来回答问题。如果无法从支持文档中得到答案,请说 “根据已知信息无法回答该问题”。回答中请不要涉及支持文档中没有提及的信息,答案请使用中文。 指令>
+
+{context}
+
+<对话历史>
+{chat_history}
+对话历史>
+
+<问题>{question}问题>
+答案:"""
+
+ZH_RETRIEVAL_QA_TRIGGER_KEYWORDS = ["无法回答该问题"]
+ZH_RETRIEVAL_QA_REJECTION_ANSWER = "抱歉,根据提供的信息无法回答该问题。"
+
+
+_ZH_RETRIEVAL_CLASSIFICATION_USE_CASE = """使用提供的参考案例判断客户遇到的故障所属的故障原因分类。
+
+背景信息:
+{context}
+
+客服记录:
+{question}
+故障原因分类:"""
+
+_ZH_DISAMBIGUATION_PROMPT = """你是一个乐于助人、恭敬而诚实的助手。你总是按照指示去做。
+请用聊天记录中提到的具体名称或实体名称替换给定句子中的任何模糊或有歧义的指代,如果没有提供聊天记录或句子中不包含模糊或有歧义的指代,则只输出原始句子。您的输出应该是消除歧义的句子本身(与“消除歧义的句子:”在同一行中),并且不包含任何其他内容。
+
+下面是一个例子:
+聊天记录:
+用户: 我有一个朋友,张三。你认识他吗?
+Assistant: 我认识一个叫张三的人
+
+句子: 他最喜欢的食物是什么?
+消除歧义的句子: 张三最喜欢的食物是什么?
+
+聊天记录:
+{chat_history}
+
+句子: {input}
+消除歧义的句子:"""
+
+# English retrieval qa prompt
+
+_EN_RETRIEVAL_QA_PROMPT = """[INST] <>Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist content.
+If the answer cannot be infered based on the given context, please say "I cannot answer the question based on the information given.".<>
+Use the context and chat history to answer the question.
+
+context:
+{context}
+
+chat history
+{chat_history}
+
+question: {question}
+answer:"""
+EN_RETRIEVAL_QA_TRIGGER_KEYWORDS = ["cannot answer the question"]
+EN_RETRIEVAL_QA_REJECTION_ANSWER = "Sorry, this question cannot be answered based on the information provided."
+
+_EN_DISAMBIGUATION_PROMPT = """[INST] <>You are a helpful, respectful and honest assistant. You always follow the instruction.<>
+Please replace any ambiguous references in the given sentence with the specific names or entities mentioned in the chat history or just output the original sentence if no chat history is provided or if the sentence doesn't contain ambiguous references. Your output should be the disambiguated sentence itself (in the same line as "disambiguated sentence:") and contain nothing else.
+
+Here is an example:
+Chat history:
+Human: I have a friend, Mike. Do you know him?
+Assistant: Yes, I know a person named Mike
+
+sentence: What's his favorate food?
+disambiguated sentence: What's Mike's favorate food?
+[/INST]
+Chat history:
+{chat_history}
+
+sentence: {input}
+disambiguated sentence:"""
+
+
+PROMPT_RETRIEVAL_QA_EN = PromptTemplate(
+ template=_EN_RETRIEVAL_QA_PROMPT, input_variables=["question", "chat_history", "context"]
+)
+
+PROMPT_DISAMBIGUATE_EN = PromptTemplate(template=_EN_DISAMBIGUATION_PROMPT, input_variables=["chat_history", "input"])
+
+SUMMARY_PROMPT_ZH = PromptTemplate(input_variables=["summary", "new_lines"], template=_CUSTOM_SUMMARIZER_TEMPLATE_ZH)
+
+PROMPT_DISAMBIGUATE_ZH = PromptTemplate(template=_ZH_DISAMBIGUATION_PROMPT, input_variables=["chat_history", "input"])
+
+PROMPT_RETRIEVAL_QA_ZH = PromptTemplate(
+ template=_ZH_RETRIEVAL_QA_PROMPT, input_variables=["question", "chat_history", "context"]
+)
+
+PROMPT_RETRIEVAL_CLASSIFICATION_USE_CASE_ZH = PromptTemplate(
+ template=_ZH_RETRIEVAL_CLASSIFICATION_USE_CASE, input_variables=["question", "context"]
+)
diff --git a/applications/ColossalQA/colossalqa/retrieval_conversation_en.py b/applications/ColossalQA/colossalqa/retrieval_conversation_en.py
new file mode 100644
index 000000000000..d2626321d68d
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/retrieval_conversation_en.py
@@ -0,0 +1,87 @@
+"""
+Script for Chinese retrieval based conversation system backed by ChatGLM
+"""
+from typing import Tuple
+
+from colossalqa.chain.retrieval_qa.base import RetrievalQA
+from colossalqa.local.llm import ColossalAPI, ColossalLLM
+from colossalqa.memory import ConversationBufferWithSummary
+from colossalqa.mylogging import get_logger
+from colossalqa.prompt.prompt import PROMPT_DISAMBIGUATE_EN, PROMPT_RETRIEVAL_QA_EN
+from colossalqa.retriever import CustomRetriever
+from langchain import LLMChain
+
+logger = get_logger()
+
+
+class EnglishRetrievalConversation:
+ """
+ Wrapper class for Chinese retrieval conversation system
+ """
+
+ def __init__(self, retriever: CustomRetriever, model_path: str, model_name: str) -> None:
+ """
+ Setup retrieval qa chain for Chinese retrieval based QA
+ """
+ logger.info(f"model_name: {model_name}; model_path: {model_path}", verbose=True)
+ colossal_api = ColossalAPI.get_api(model_name, model_path)
+ self.llm = ColossalLLM(n=1, api=colossal_api)
+
+ # Define the retriever
+ self.retriever = retriever
+
+ # Define the chain to preprocess the input
+ # Disambiguate the input. e.g. "What is the capital of that country?" -> "What is the capital of France?"
+ # Prompt is summarization prompt
+ self.llm_chain_disambiguate = LLMChain(
+ llm=self.llm,
+ prompt=PROMPT_DISAMBIGUATE_EN,
+ llm_kwargs={"max_new_tokens": 30, "temperature": 0.6, "do_sample": True},
+ )
+
+ self.retriever.set_rephrase_handler(self.disambiguity)
+ # Define memory with summarization ability
+ self.memory = ConversationBufferWithSummary(
+ llm=self.llm, llm_kwargs={"max_new_tokens": 50, "temperature": 0.6, "do_sample": True}
+ )
+ self.memory.initiate_document_retrieval_chain(
+ self.llm,
+ PROMPT_RETRIEVAL_QA_EN,
+ self.retriever,
+ chain_type_kwargs={
+ "chat_history": "",
+ },
+ )
+ self.retrieval_chain = RetrievalQA.from_chain_type(
+ llm=self.llm,
+ verbose=False,
+ chain_type="stuff",
+ retriever=self.retriever,
+ chain_type_kwargs={"prompt": PROMPT_RETRIEVAL_QA_EN, "memory": self.memory},
+ llm_kwargs={"max_new_tokens": 50, "temperature": 0.75, "do_sample": True},
+ )
+
+ def disambiguity(self, input: str):
+ out = self.llm_chain_disambiguate.run(input=input, chat_history=self.memory.buffer, stop=["\n"])
+ return out.split("\n")[0]
+
+ @classmethod
+ def from_retriever(
+ cls, retriever: CustomRetriever, model_path: str, model_name: str
+ ) -> "EnglishRetrievalConversation":
+ return cls(retriever, model_path, model_name)
+
+ def run(self, user_input: str, memory: ConversationBufferWithSummary) -> Tuple[str, ConversationBufferWithSummary]:
+ if memory:
+ # TODO add translation chain here
+ self.memory.buffered_history.messages = memory.buffered_history.messages
+ self.memory.summarized_history_temp.messages = memory.summarized_history_temp.messages
+ return (
+ self.retrieval_chain.run(
+ query=user_input,
+ stop=[self.memory.human_prefix + ": "],
+ rejection_trigger_keywrods=["cannot answer the question"],
+ rejection_answer="Sorry, this question cannot be answered based on the information provided.",
+ ).split("\n")[0],
+ self.memory,
+ )
diff --git a/applications/ColossalQA/colossalqa/retrieval_conversation_universal.py b/applications/ColossalQA/colossalqa/retrieval_conversation_universal.py
new file mode 100644
index 000000000000..76bec715fb6e
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/retrieval_conversation_universal.py
@@ -0,0 +1,138 @@
+"""
+Multilingual retrieval based conversation system
+"""
+from typing import List
+
+from colossalqa.data_loader.document_loader import DocumentLoader
+from colossalqa.mylogging import get_logger
+from colossalqa.retrieval_conversation_en import EnglishRetrievalConversation
+from colossalqa.retrieval_conversation_zh import ChineseRetrievalConversation
+from colossalqa.retriever import CustomRetriever
+from colossalqa.text_splitter import ChineseTextSplitter
+from colossalqa.utils import detect_lang_naive
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
+
+logger = get_logger()
+
+
+class UniversalRetrievalConversation:
+ """
+ Wrapper class for bilingual retrieval conversation system
+ """
+
+ def __init__(
+ self,
+ embedding_model_path: str = "moka-ai/m3e-base",
+ embedding_model_device: str = "cpu",
+ zh_model_path: str = None,
+ zh_model_name: str = None,
+ en_model_path: str = None,
+ en_model_name: str = None,
+ sql_file_path: str = None,
+ files_zh: List[List[str]] = None,
+ files_en: List[List[str]] = None,
+ text_splitter_chunk_size=100,
+ text_splitter_chunk_overlap=10,
+ ) -> None:
+ """
+ Warpper for multilingual retrieval qa class (Chinese + English)
+ Args:
+ embedding_model_path: local or huggingface embedding model
+ embedding_model_device:
+ files_zh: [[file_path, name_of_file, separator],...] defines the files used as supporting documents for Chinese retrieval QA
+ files_en: [[file_path, name_of_file, separator],...] defines the files used as supporting documents for English retrieval QA
+ """
+ self.embedding = HuggingFaceEmbeddings(
+ model_name=embedding_model_path,
+ model_kwargs={"device": embedding_model_device},
+ encode_kwargs={"normalize_embeddings": False},
+ )
+ print("Select files for constructing Chinese retriever")
+ docs_zh = self.load_supporting_docs(
+ files=files_zh,
+ text_splitter=ChineseTextSplitter(
+ chunk_size=text_splitter_chunk_size, chunk_overlap=text_splitter_chunk_overlap
+ ),
+ )
+ # Create retriever
+ self.information_retriever_zh = CustomRetriever(
+ k=3, sql_file_path=sql_file_path.replace(".db", "_zh.db"), verbose=True
+ )
+ self.information_retriever_zh.add_documents(
+ docs=docs_zh, cleanup="incremental", mode="by_source", embedding=self.embedding
+ )
+
+ print("Select files for constructing English retriever")
+ docs_en = self.load_supporting_docs(
+ files=files_en,
+ text_splitter=RecursiveCharacterTextSplitter(
+ chunk_size=text_splitter_chunk_size, chunk_overlap=text_splitter_chunk_overlap
+ ),
+ )
+ # Create retriever
+ self.information_retriever_en = CustomRetriever(
+ k=3, sql_file_path=sql_file_path.replace(".db", "_en.db"), verbose=True
+ )
+ self.information_retriever_en.add_documents(
+ docs=docs_en, cleanup="incremental", mode="by_source", embedding=self.embedding
+ )
+
+ self.chinese_retrieval_conversation = ChineseRetrievalConversation.from_retriever(
+ self.information_retriever_zh, model_path=zh_model_path, model_name=zh_model_name
+ )
+ self.english_retrieval_conversation = EnglishRetrievalConversation.from_retriever(
+ self.information_retriever_en, model_path=en_model_path, model_name=en_model_name
+ )
+ self.memory = None
+
+ def load_supporting_docs(self, files: List[List[str]] = None, text_splitter: TextSplitter = None):
+ """
+ Load supporting documents, currently, all documents will be stored in one vector store
+ """
+ documents = []
+ if files:
+ for file in files:
+ retriever_data = DocumentLoader([[file["data_path"], file["name"]]]).all_data
+ splits = text_splitter.split_documents(retriever_data)
+ documents.extend(splits)
+ else:
+ while True:
+ file = input("Select a file to load or press Enter to exit:")
+ if file == "":
+ break
+ data_name = input("Enter a short description of the data:")
+ separator = input(
+ "Enter a separator to force separating text into chunks, if no separator is given, the defaut separator is '\\n\\n', press ENTER directly to skip:"
+ )
+ separator = separator if separator != "" else "\n\n"
+ retriever_data = DocumentLoader([[file, data_name.replace(" ", "_")]]).all_data
+
+ # Split
+ splits = text_splitter.split_documents(retriever_data)
+ documents.extend(splits)
+ return documents
+
+ def start_test_session(self):
+ """
+ Simple multilingual session for testing purpose, with naive language selection mechanism
+ """
+ while True:
+ user_input = input("User: ")
+ lang = detect_lang_naive(user_input)
+ if "END" == user_input:
+ print("Agent: Happy to chat with you :)")
+ break
+ agent_response = self.run(user_input, which_language=lang)
+ print(f"Agent: {agent_response}")
+
+ def run(self, user_input: str, which_language=str):
+ """
+ Generate the response given the user input and a str indicates the language requirement of the output string
+ """
+ assert which_language in ["zh", "en"]
+ if which_language == "zh":
+ agent_response, self.memory = self.chinese_retrieval_conversation.run(user_input, self.memory)
+ else:
+ agent_response, self.memory = self.english_retrieval_conversation.run(user_input, self.memory)
+ return agent_response.split("\n")[0]
diff --git a/applications/ColossalQA/colossalqa/retrieval_conversation_zh.py b/applications/ColossalQA/colossalqa/retrieval_conversation_zh.py
new file mode 100644
index 000000000000..484be21c1553
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/retrieval_conversation_zh.py
@@ -0,0 +1,94 @@
+"""
+Script for Chinese retrieval based conversation system backed by ChatGLM
+"""
+from typing import Tuple
+
+from colossalqa.chain.retrieval_qa.base import RetrievalQA
+from colossalqa.local.llm import ColossalAPI, ColossalLLM
+from colossalqa.memory import ConversationBufferWithSummary
+from colossalqa.mylogging import get_logger
+from colossalqa.prompt.prompt import PROMPT_DISAMBIGUATE_ZH, PROMPT_RETRIEVAL_QA_ZH, SUMMARY_PROMPT_ZH
+from colossalqa.retriever import CustomRetriever
+from langchain import LLMChain
+
+logger = get_logger()
+
+
+class ChineseRetrievalConversation:
+ """
+ Wrapper class for Chinese retrieval conversation system
+ """
+
+ def __init__(self, retriever: CustomRetriever, model_path: str, model_name: str) -> None:
+ """
+ Setup retrieval qa chain for Chinese retrieval based QA
+ """
+ # Local coati api
+ logger.info(f"model_name: {model_name}; model_path: {model_path}", verbose=True)
+ colossal_api = ColossalAPI.get_api(model_name, model_path)
+ self.llm = ColossalLLM(n=1, api=colossal_api)
+
+ # Define the retriever
+ self.retriever = retriever
+
+ # Define the chain to preprocess the input
+ # Disambiguate the input. e.g. "What is the capital of that country?" -> "What is the capital of France?"
+ # Prompt is summarization prompt
+ self.llm_chain_disambiguate = LLMChain(
+ llm=self.llm,
+ prompt=PROMPT_DISAMBIGUATE_ZH,
+ llm_kwargs={"max_new_tokens": 30, "temperature": 0.6, "do_sample": True},
+ )
+
+ self.retriever.set_rephrase_handler(self.disambiguity)
+ # Define memory with summarization ability
+ self.memory = ConversationBufferWithSummary(
+ llm=self.llm,
+ prompt=SUMMARY_PROMPT_ZH,
+ human_prefix="用户",
+ ai_prefix="Assistant",
+ max_tokens=2000,
+ llm_kwargs={"max_new_tokens": 50, "temperature": 0.6, "do_sample": True},
+ )
+ self.memory.initiate_document_retrieval_chain(
+ self.llm,
+ PROMPT_RETRIEVAL_QA_ZH,
+ self.retriever,
+ chain_type_kwargs={
+ "chat_history": "",
+ },
+ )
+ self.retrieval_chain = RetrievalQA.from_chain_type(
+ llm=self.llm,
+ verbose=False,
+ chain_type="stuff",
+ retriever=self.retriever,
+ chain_type_kwargs={"prompt": PROMPT_RETRIEVAL_QA_ZH, "memory": self.memory},
+ llm_kwargs={"max_new_tokens": 150, "temperature": 0.9, "do_sample": True},
+ )
+
+ def disambiguity(self, input: str):
+ out = self.llm_chain_disambiguate.run(input=input, chat_history=self.memory.buffer, stop=["\n"])
+ return out.split("\n")[0]
+
+ @classmethod
+ def from_retriever(
+ cls, retriever: CustomRetriever, model_path: str, model_name: str
+ ) -> "ChineseRetrievalConversation":
+ return cls(retriever, model_path, model_name)
+
+ def run(self, user_input: str, memory: ConversationBufferWithSummary) -> Tuple[str, ConversationBufferWithSummary]:
+ if memory:
+ # TODO add translation chain here
+ self.memory.buffered_history.messages = memory.buffered_history.messages
+ self.memory.summarized_history_temp.messages = memory.summarized_history_temp.messages
+ return (
+ self.retrieval_chain.run(
+ query=user_input,
+ stop=["答案>"],
+ doc_prefix="支持文档",
+ rejection_trigger_keywrods=["无法回答该问题"],
+ rejection_answer="抱歉,根据提供的信息无法回答该问题。",
+ ).split("\n")[0],
+ self.memory,
+ )
diff --git a/applications/ColossalQA/colossalqa/retriever.py b/applications/ColossalQA/colossalqa/retriever.py
new file mode 100644
index 000000000000..9ea6d5b080cd
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/retriever.py
@@ -0,0 +1,166 @@
+"""
+Code for custom retriver with incremental update
+"""
+import copy
+import hashlib
+import os
+from collections import defaultdict
+from typing import Any, Callable, Dict, List
+
+from colossalqa.mylogging import get_logger
+from langchain.callbacks.manager import CallbackManagerForRetrieverRun
+from langchain.embeddings.base import Embeddings
+from langchain.indexes import SQLRecordManager, index
+from langchain.schema.retriever import BaseRetriever, Document
+from langchain.vectorstores.base import VectorStore
+from langchain.vectorstores.chroma import Chroma
+
+logger = get_logger()
+
+
+class CustomRetriever(BaseRetriever):
+ """
+ Custom retriever class with support for incremental update of indexes
+ """
+
+ vector_stores: Dict[str, VectorStore] = {}
+ sql_index_database: Dict[str, str] = {}
+ record_managers: Dict[str, SQLRecordManager] = {}
+ sql_db_chains = []
+ k = 3
+ rephrase_handler: Callable = None
+ buffer: Dict = []
+ buffer_size: int = 5
+ verbose: bool = False
+ sql_file_path: str = None
+
+ @classmethod
+ def from_documents(
+ cls,
+ documents: List[Document],
+ embeddings: Embeddings,
+ **kwargs: Any,
+ ) -> BaseRetriever:
+ k = kwargs.pop("k", 3)
+ cleanup = kwargs.pop("cleanup", "incremental")
+ mode = kwargs.pop("mode", "by_source")
+ ret = cls(k=k)
+ ret.add_documents(documents, embedding=embeddings, cleanup=cleanup, mode=mode)
+ return ret
+
+ def add_documents(
+ self,
+ docs: Dict[str, Document] = [],
+ cleanup: str = "incremental",
+ mode: str = "by_source",
+ embedding: Embeddings = None,
+ ) -> None:
+ """
+ Add documents to retriever
+ Args:
+ docs: the documents to add
+ cleanup: choose from "incremental" (update embeddings, skip existing embeddings) and "full" (destory and rebuild retriever)
+ mode: choose from "by source" (documents are grouped by source) and "merge" (documents are merged into one vector store)
+ """
+ if cleanup == "full":
+ # Cleanup
+ for source in self.vector_stores:
+ os.remove(self.sql_index_database[source])
+ # Add documents
+ data_by_source = defaultdict(list)
+ if mode == "by_source":
+ for doc in docs:
+ data_by_source[doc.metadata["source"]].append(doc)
+ elif mode == "merge":
+ data_by_source["merged"] = docs
+ for source in data_by_source:
+ if source not in self.vector_stores:
+ hash_encoding = hashlib.sha3_224(source.encode()).hexdigest()
+ if os.path.exists(f"{self.sql_file_path}/{hash_encoding}.db"):
+ # Remove the stale file
+ os.remove(f"{self.sql_file_path}/{hash_encoding}.db")
+ # Create a new sql database to store indexes, sql files are stored in the same directory as the source file
+ sql_path = f"sqlite:///{self.sql_file_path}/{hash_encoding}.db"
+ self.vector_stores[source] = Chroma(embedding_function=embedding, collection_name=hash_encoding)
+ self.sql_index_database[source] = f"{self.sql_file_path}/{hash_encoding}.db"
+ self.record_managers[source] = SQLRecordManager(source, db_url=sql_path)
+ self.record_managers[source].create_schema()
+ index(
+ data_by_source[source],
+ self.record_managers[source],
+ self.vector_stores[source],
+ cleanup=cleanup,
+ source_id_key="source",
+ )
+
+ def __del__(self):
+ for source in self.sql_index_database:
+ if os.path.exists(self.sql_index_database[source]):
+ os.remove(self.sql_index_database[source])
+
+ def set_sql_database_chain(self, db_chains) -> None:
+ """
+ set sql agent chain to retrieve information from sql database
+ Not used in this version
+ """
+ self.sql_db_chains = db_chains
+
+ def set_rephrase_handler(self, handler: Callable = None) -> None:
+ """
+ Set a handler to preprocess the input str before feed into the retriever
+ """
+ self.rephrase_handler = handler
+
+ def _get_relevant_documents(
+ self,
+ query: str,
+ *,
+ run_manager: CallbackManagerForRetrieverRun = None,
+ score_threshold: float = None,
+ return_scores: bool = False,
+ ) -> List[Document]:
+ """
+ This function is called by the retriever to get the relevant documents.
+ recent vistied queries are stored in buffer, if the query is in buffer, return the documents directly
+
+ Args:
+ query: the query to be searched
+ run_manager: the callback manager for retriever run
+ Returns:
+ documents: the relevant documents
+ """
+ for buffered_doc in self.buffer:
+ if buffered_doc[0] == query:
+ return buffered_doc[1]
+ query_ = str(query)
+ # Use your existing retriever to get the documents
+ if self.rephrase_handler:
+ query = self.rephrase_handler(query)
+ documents = []
+ for k in self.vector_stores:
+ # Retrieve documents from each retriever
+ vectorstore = self.vector_stores[k]
+ documents.extend(vectorstore.similarity_search_with_score(query, self.k, score_threshold=score_threshold))
+ # print(documents)
+ # Return the top k documents among all retrievers
+ documents = sorted(documents, key=lambda x: x[1], reverse=False)[: self.k]
+ if return_scores:
+ # Return score
+ documents = copy.deepcopy(documents)
+ for doc in documents:
+ doc[0].metadata["score"] = doc[1]
+ documents = [doc[0] for doc in documents]
+ # Retrieve documents from sql database (not applicable for the local chains)
+ for sql_chain in self.sql_db_chains:
+ documents.append(
+ Document(
+ page_content=f"Query: {query} Answer: {sql_chain.run(query)}", metadata={"source": "sql_query"}
+ )
+ )
+ if len(self.buffer) < self.buffer_size:
+ self.buffer.append([query_, documents])
+ else:
+ self.buffer.pop(0)
+ self.buffer.append([query_, documents])
+ logger.info(f"retrieved documents:\n{str(documents)}", verbose=self.verbose)
+ return documents
diff --git a/applications/ColossalQA/colossalqa/text_splitter/__init__.py b/applications/ColossalQA/colossalqa/text_splitter/__init__.py
new file mode 100644
index 000000000000..b56fdfe8d582
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/text_splitter/__init__.py
@@ -0,0 +1 @@
+from .chinese_text_splitter import ChineseTextSplitter
diff --git a/applications/ColossalQA/colossalqa/text_splitter/chinese_text_splitter.py b/applications/ColossalQA/colossalqa/text_splitter/chinese_text_splitter.py
new file mode 100644
index 000000000000..3815f5ed2621
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/text_splitter/chinese_text_splitter.py
@@ -0,0 +1,56 @@
+"""
+Code for Chinese text splitter
+"""
+from typing import Any, List, Optional
+
+from colossalqa.text_splitter.utils import get_cleaned_paragraph
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+class ChineseTextSplitter(RecursiveCharacterTextSplitter):
+ def __init__(self, separators: Optional[List[str]] = None, is_separator_regrx: bool = False, **kwargs: Any):
+ self._separators = separators or ["\n\n", "\n", ",", "。", "!", "?", "?"]
+ if "chunk_size" not in kwargs:
+ kwargs["chunk_size"] = 50
+ if "chunk_overlap" not in kwargs:
+ kwargs["chunk_overlap"] = 10
+ super().__init__(separators=separators, keep_separator=True, **kwargs)
+ self._is_separator_regex = is_separator_regrx
+
+ def split_text(self, text: str) -> List[str]:
+ """Return the list of separated text chunks"""
+ cleaned_paragraph = get_cleaned_paragraph(text)
+ splitted = []
+ for paragraph in cleaned_paragraph:
+ segs = super().split_text(paragraph)
+ for i in range(len(segs) - 1):
+ if segs[i][-1] not in self._separators:
+ pos = text.find(segs[i])
+ pos_end = pos + len(segs[i])
+ if i > 0:
+ last_sentence_start = max([text.rfind(m, 0, pos) for m in ["。", "!", "?"]])
+ pos = last_sentence_start + 1
+ segs[i] = str(text[pos:pos_end])
+ if i != len(segs) - 1:
+ next_sentence_end = max([text.find(m, pos_end) for m in ["。", "!", "?"]])
+ segs[i] = str(text[pos : next_sentence_end + 1])
+ splitted.append(segs[i])
+ if len(splitted) <= 1:
+ return splitted
+ splitted_text = []
+ i = 1
+ if splitted[0] not in splitted[1]:
+ splitted_text.append([splitted[0], 0])
+ if splitted[-1] not in splitted[-2]:
+ splitted_text.append([splitted[-1], len(splitted) - 1])
+ while i < len(splitted) - 1:
+ if splitted[i] not in splitted[i + 1] and splitted[i] not in splitted[i - 1]:
+ splitted_text.append([splitted[i], i])
+ i += 1
+ splitted_text = sorted(splitted_text, key=lambda x: x[1])
+ splitted_text = [splitted_text[i][0] for i in range(len(splitted_text))]
+ ret = []
+ for s in splitted_text:
+ if s not in ret:
+ ret.append(s)
+ return ret
diff --git a/applications/ColossalQA/colossalqa/text_splitter/utils.py b/applications/ColossalQA/colossalqa/text_splitter/utils.py
new file mode 100644
index 000000000000..250b46d9742a
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/text_splitter/utils.py
@@ -0,0 +1,19 @@
+import re
+
+
+def remove_format(text: str) -> str:
+ # if the accout of \t, \r, \v, \f is less than 3, replace \t, \r, \v, \f with space
+ if len(re.findall(r"\s", text.replace(" ", ""))) > 3:
+ # in case this is a line of a table
+ return text
+ return re.sub(r"\s", " ", text)
+
+
+# remove newlines
+def get_cleaned_paragraph(s: str) -> str:
+ text = str(s)
+ text = re.sub(r"\n{3,}", r"\n", text) # replace \n\n\n... with \n
+ text = re.sub("\n\n", "", text)
+ lines = text.split("\n")
+ lines_remove_format = [remove_format(line) for line in lines]
+ return lines_remove_format
diff --git a/applications/ColossalQA/colossalqa/utils.py b/applications/ColossalQA/colossalqa/utils.py
new file mode 100644
index 000000000000..cd8c3e5acec8
--- /dev/null
+++ b/applications/ColossalQA/colossalqa/utils.py
@@ -0,0 +1,61 @@
+import re
+from typing import Union
+
+from colossalqa.mylogging import get_logger
+from sqlalchemy import Engine, MetaData, create_engine
+from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.ext.declarative import declarative_base
+
+logger = get_logger()
+
+
+def drop_table(engine: Engine) -> None:
+ """
+ Drop all existing table
+ """
+ Base = declarative_base()
+ metadata = MetaData()
+ metadata.reflect(bind=engine)
+ for key in metadata.tables:
+ table = metadata.tables[key]
+ if table is not None:
+ Base.metadata.drop_all(engine, [table], checkfirst=True)
+
+
+def create_empty_sql_database(database_uri):
+ try:
+ # Create an SQLAlchemy engine to connect to the database
+ engine = create_engine(database_uri)
+
+ # Create the database
+ engine.connect()
+
+ logger.info(f"Database created at {database_uri}")
+ except SQLAlchemyError as e:
+ logger.error(f"Error creating database: {str(e)}")
+ return engine, database_uri
+
+
+def destroy_sql_database(sql_engine: Union[Engine, str]) -> None:
+ """
+ Destroy an sql database
+ """
+ if isinstance(sql_engine, str):
+ sql_engine = create_engine(sql_engine)
+ drop_table(sql_engine)
+ sql_engine.dispose()
+ sql_engine = None
+
+
+def detect_lang_naive(s):
+ """
+ Naive function for language detection, should be replaced by an independant layer
+ """
+ remove_nota = "[’·°–!\"#$%&'()*+,-./:;<=>?@,。?★、…【】()《》?“”‘’![\\]^_`{|}~]+"
+ s = re.sub(remove_nota, "", s)
+ s = re.sub("[0-9]", "", s).strip()
+ res = re.sub("[a-zA-Z]", "", s).strip()
+ if len(res) <= 0:
+ return "en"
+ else:
+ return "zh"
diff --git a/applications/ColossalQA/data/data_sample/companies.txt b/applications/ColossalQA/data/data_sample/companies.txt
new file mode 100644
index 000000000000..05c6148f18a5
--- /dev/null
+++ b/applications/ColossalQA/data/data_sample/companies.txt
@@ -0,0 +1,6 @@
+Overview The Straits Times is the English flagship daily of SPH Media, one of the leading media companies in Asia. Launched on July 15, 1845, its comprehensive coverage of news from home and around the world makes The Straits Times the most-read newspaper in Singapore. Quality news, in-depth analyses, impactful commentaries and breaking stories are packaged to give readers riveting accounts of events in Singapore, the region, and beyond. The most read newspaper in Singapore, both in terms of print and digital, it reaches 1.33 million people every day. The Straits Times' key strength is in its world class coverage of news outside Singapore. With 20 bureaus in major cities around the world, The Straits Times correspondents bring world news to readers on a Singapore platter, helping readers to appreciate world events from a Singaporean perspective. Website http://www.straitstimes.com Phone 63196319Phone number is 63196319 Industry Newspaper Publishing Company size 1,001-5,000 employees 183 on LinkedIn Includes members with current employer listed as The Straits Times, including part-time roles. Headquarters Singapore, Singapore Founded 1845 Specialties News and Digital media
+About With over 500 properties worldwide, Marriott Hotels has reimagined hospitality to exceed the expectations of business, group, and leisure travelers.
+Marriott Hotels, Marriott’s flagship brand of quality-tier, full-service hotels and resorts, provides consistent, dependable and genuinely caring experiences to guests on their terms. Marriott is a brilliant host to guests who effortlessly blend life and work, and who are inspired by how modern travel enhances them both. Our hotels offer warm, professional service; sophisticated yet functional guest room design; lobby spaces that facilitate working, dining and socializing; restaurants and bars serving international cuisine prepared simply and from the freshest ingredients; meeting and event spaces and services that are gold standard; and expansive, 24-hour fitness facilities.
+Overview AERCO International, Inc. is a recognized leader in delivering cost-effective, condensing commercial boilers, high-efficiency water heaters across a variety of markets including education, lodging, government, office buildings, healthcare, industrial and multifamily housing. AERCO's system design approach provides customer-specific solutions that deliver superior building performance at a lower operating cost while assuring uptime reliability. When AERCO was founded in 1949, it introduced a revolutionary design for an indirect-fired water heater that heated water on demand, and without storage, at a controlled temperature. This innovation became today's standard for water heaters, maximizing the recovery of latent heat energy and significantly increasing operating efficiency. AERCO continued to innovate and in 1988, introduced the first condensing and fully modulating boiler and water heater to the commercial market. The modulating capability of these products, still unsurpassed more than 25 years later, matches the equipment's output to real-time heating demand, ensuring the units draw no more fuel to operate than is absolutely necessary. This not only saves precious energy, but also ensures money doesn't needlessly disappear "up the stack." AERCO differentiates itself through a solution-based model, leveraging decades of engineering experience and industry application expertise to understand each customer’s unique needs. By partnering directly with customers and end-users to understand their project-specific requirements, AERCO provides tailored application solutions that are comprised of original product technologies including high efficiency condensing products, compact footprints, high turndown ratios, unique fuel delivery, leading control systems and proprietary design elements that combine to deliver up to 99% efficiency. Website http://www.aerco.com Phone 845-580-8000Phone number is 845-580-8000 Industry Industrial Machinery Manufacturing Company size 51-200 employees 119 on LinkedIn Includes members with current employer listed as AERCO International, Inc., including part-time roles. Headquarters Blauvelt, NY Founded 1949 Specialties Leading manufacturer of condensing boilers, water heating and energy recovery products and The originator of semi-instantaneous water heating
+Prince PLC: Overview We are a global leader of quality water solutions for residential, industrial, municipal, and commercial settings. Our family of brands offers one of the most varied product lines in the world, with world-class, water-related solutions focused on: • Plumbing & Flow Control • Water Quality & Conditioning • Water Reuse & Drainage • HVAC • Municipal Waterworks Strategic Goals Watts Water is traded on the New York Stock Exchange under the symbol “WTS.” As a public company, growing shareholder value is critical. To that end, we focus on a five-part Global Strategy: Growth, Commercial Excellence, Operational Excellence, “One Watts Water,” and a Talent & Performance Culture. Follow us on all social media platforms @WattsWater Website http://www.watts.com/ Industry Wholesale Building Materials Company size 5,001-10,000 employees 2,248 on LinkedIn Includes members with current employer listed as Watts Water Technologies, including part-time roles. Headquarters North Andover, MA Specialties Plumbing, HVAC, Water Quality, Gas, Conditioning, Waterworks, and Drainage
+About Courtyard Hotels is Marriott International’s largest hotel brand, with more than 1,100 hotels in over 50 countries worldwide. So, no matter where passion takes you, you’ll find us there to help you follow it. Proud members of Marriott Bonvoy.
\ No newline at end of file
diff --git a/applications/ColossalQA/data/data_sample/companies_zh.txt b/applications/ColossalQA/data/data_sample/companies_zh.txt
new file mode 100644
index 000000000000..a67a93590ee8
--- /dev/null
+++ b/applications/ColossalQA/data/data_sample/companies_zh.txt
@@ -0,0 +1,6 @@
+《海峡时报》是SPH传媒旗下的英文旗舰日报,SPH传媒是亚洲领先的传媒公司之一。《海峡时报》创刊于1845年7月15日,全面报道国内外新闻,是新加坡发行量最大的报纸。高质量的新闻、深入的分析、有影响力的评论和突发事件,为读者提供新加坡、该地区乃至其他地区的引人入胜的事件报道。无论是纸媒还是电子版,它都是新加坡阅读量最大的报纸,每天有133万人阅读。《海峡时报》的主要优势在于它对新加坡以外新闻的世界级报道。《海峡时报》记者在全球主要城市设有20个分社,用新加坡的盘子把世界新闻带给读者,帮助读者从新加坡的角度了解世界大事。网站http://www.straitstimes.com电话63196319电话63196319工业报纸出版公司规模1,001-5,000员工LinkedIn 183包括目前雇主为海峡时报的成员,包括兼职工作。总部位于新加坡,新加坡成立于1845年,专业从事新闻和数字媒体
+万豪酒店在全球拥有500多家酒店,以超越商务、团体和休闲旅客的期望,重塑酒店服务。
+万豪酒店(Marriott Hotels)是万豪旗下优质、全方位服务酒店和度假村的旗舰品牌,为客人提供始终如一、可靠和真诚关怀的体验。万豪是一个出色的主人,客人可以轻松地将生活和工作融合在一起,并受到现代旅行如何增强两者的启发。我们的酒店提供热情、专业的服务;精致而实用的客房设计;大堂空间,方便工作、餐饮和社交;餐厅和酒吧提供简单的国际美食和最新鲜的食材;会议及活动场地及服务均属黄金标准;还有宽敞的24小时健身设施。
+AERCO International, Inc.是公认的领导者,为教育、住宿、政府、办公楼、医疗保健、工业和多户住宅等各种市场提供具有成本效益的冷凝商用锅炉和高效热水器。AERCO的系统设计方法为客户提供特定的解决方案,以较低的运营成本提供卓越的建筑性能,同时确保正常运行时间的可靠性。AERCO成立于1949年,它推出了一种革命性的设计,用于间接燃烧热水器,在控制温度下按需加热水,而无需储存。这一创新成为当今热水器的标准,最大限度地回收潜热能量,显著提高运行效率。AERCO不断创新,并于1988年向商业市场推出了第一台冷凝和全调制锅炉和热水器。这些产品的调制能力,在超过25年后仍然无与伦比,使设备的输出与实时加热需求相匹配,确保机组不会消耗更多的燃料来运行,除非绝对必要。这不仅节省了宝贵的能源,还确保了钱不会不必要地消失在“堆栈”上。AERCO通过基于解决方案的模式脱颖而出,利用数十年的工程经验和行业应用专业知识来了解每个客户的独特需求。通过与客户和最终用户直接合作,了解他们的项目具体要求,AERCO提供量身定制的应用解决方案,这些解决方案由原创产品技术组成,包括高效冷凝产品,紧凑的足迹,高降压比,独特的燃料输送,领先的控制系统和专有设计元素,结合起来可提供高达99%的效率。网址http://www.aerco.com电话845-580- 8000电话号码845-580-8000工业工业机械制造公司规模51-200名员工LinkedIn上包括当前雇主AERCO International, Inc的成员,包括兼职职位。总部成立于1949年,纽约州布劳维尔特,专长:冷凝锅炉,水加热和能源回收产品的领先制造商,半瞬时水加热的鼻祖
+Prince PLC:概述Prince PLC是为住宅、工业、市政和商业环境提供优质水解决方案的全球领导者。我们的品牌家族提供世界上最多样化的产品线之一,拥有世界级的水相关解决方案,专注于:•管道和流量控制•水质和调理•水再利用和排水•hvac•市政水务战略目标瓦茨水务在纽约证券交易所上市,代码为“WTS”。作为一家上市公司,股东价值的增长至关重要。为此,我们将重点放在五部分全球战略上:增长、卓越商业、卓越运营、“一瓦茨水”以及人才与绩效文化。在所有社交媒体平台关注我们@WattsWater网站http://www.watts.com/行业批发建材公司规模5,001-10,000名员工领英2,248名包括目前雇主为WattsWater Technologies的成员,包括兼职职位。总部北安多弗,MA专业管道,暖通空调,水质,气体,空调,自来水厂和排水
+万怡酒店是万豪国际最大的酒店品牌,在全球50多个国家拥有1100多家酒店。所以,无论你的激情带你去哪里,你都会发现我们会帮助你追随它。万豪酒店的骄傲会员。
\ No newline at end of file
diff --git a/applications/ColossalQA/data/data_sample/csv_organization_100.csv b/applications/ColossalQA/data/data_sample/csv_organization_100.csv
new file mode 100644
index 000000000000..dbe97d5fd774
--- /dev/null
+++ b/applications/ColossalQA/data/data_sample/csv_organization_100.csv
@@ -0,0 +1,101 @@
+Index,Organization Id,Company Name,Website,Country,Description,Founded,Industry,Number of employees
+1,FAB0d41d5b5d22c,Ferrell LLC,https://price.net/,Papua New Guinea,Horizontal empowering knowledgebase,1990,Plastics,3498
+2,6A7EdDEA9FaDC52,"Mckinney, Riley and Day",http://www.hall-buchanan.info/,Finland,User-centric system-worthy leverage,2015,Glass / Ceramics / Concrete,4952
+3,0bFED1ADAE4bcC1,Hester Ltd,http://sullivan-reed.com/,China,Switchable scalable moratorium,1971,Public Safety,5287
+4,2bFC1Be8a4ce42f,Holder-Sellers,https://becker.com/,Turkmenistan,De-engineered systemic artificial intelligence,2004,Automotive,921
+5,9eE8A6a4Eb96C24,Mayer Group,http://www.brewer.com/,Mauritius,Synchronized needs-based challenge,1991,Transportation,7870
+6,cC757116fe1C085,Henry-Thompson,http://morse.net/,Bahamas,Face-to-face well-modulated customer loyalty,1992,Primary / Secondary Education,4914
+7,219233e8aFF1BC3,Hansen-Everett,https://www.kidd.org/,Pakistan,Seamless disintermediate collaboration,2018,Publishing Industry,7832
+8,ccc93DCF81a31CD,Mcintosh-Mora,https://www.brooks.com/,Heard Island and McDonald Islands,Centralized attitude-oriented capability,1970,Import / Export,4389
+9,0B4F93aA06ED03e,Carr Inc,http://ross.com/,Kuwait,Distributed impactful customer loyalty,1996,Plastics,8167
+10,738b5aDe6B1C6A5,Gaines Inc,http://sandoval-hooper.com/,Uzbekistan,Multi-lateral scalable protocol,1997,Outsourcing / Offshoring,9698
+11,AE61b8Ffebbc476,Kidd Group,http://www.lyons.com/,Bouvet Island (Bouvetoya),Proactive foreground paradigm,2001,Primary / Secondary Education,7473
+12,eb3B7D06cCdD609,Crane-Clarke,https://www.sandoval.com/,Denmark,Front-line clear-thinking encryption,2014,Food / Beverages,9011
+13,8D0c29189C9798B,"Keller, Campos and Black",https://www.garner.info/,Liberia,Ameliorated directional emulation,2020,Museums / Institutions,2862
+14,D2c91cc03CA394c,Glover-Pope,http://www.silva.biz/,United Arab Emirates,Persevering contextually-based approach,2013,Medical Practice,9079
+15,C8AC1eaf9C036F4,Pacheco-Spears,https://aguilar.com/,Sweden,Secured logistical synergy,1984,Maritime,769
+16,b5D10A14f7a8AfE,Hodge-Ayers,http://www.archer-elliott.com/,Honduras,Future-proofed radical implementation,1990,Facilities Services,8508
+17,68139b5C4De03B4,"Bowers, Guerra and Krause",http://www.carrillo-nicholson.com/,Uganda,De-engineered transitional strategy,1972,Primary / Secondary Education,6986
+18,5c2EffEfdba2BdF,Mckenzie-Melton,http://montoya-thompson.com/,Hong Kong,Reverse-engineered heuristic alliance,1998,Investment Management / Hedge Fund / Private Equity,4589
+19,ba179F19F7925f5,Branch-Mann,http://www.lozano.com/,Botswana,Adaptive intangible frame,1999,Architecture / Planning,7961
+20,c1Ce9B350BAc66b,Weiss and Sons,https://barrett.com/,Korea,Sharable optimal functionalities,2011,Plastics,5984
+21,8de40AC4e6EaCa4,"Velez, Payne and Coffey",http://burton.com/,Luxembourg,Mandatory coherent synergy,1986,Wholesale,5010
+22,Aad86a4F0385F2d,Harrell LLC,http://www.frey-rosario.com/,Guadeloupe,Reverse-engineered mission-critical moratorium,2018,Construction,2185
+23,22aC3FFd64fD703,"Eaton, Reynolds and Vargas",http://www.freeman.biz/,Monaco,Self-enabling multi-tasking process improvement,2014,Luxury Goods / Jewelry,8987
+24,5Ec4C272bCf085c,Robbins-Cummings,http://donaldson-wilkins.com/,Belgium,Organic non-volatile hierarchy,1991,Pharmaceuticals,5038
+25,5fDBeA8BB91a000,Jenkins Inc,http://www.kirk.biz/,South Africa,Front-line systematic help-desk,2002,Insurance,1215
+26,dFfD6a6F9AC2d9C,"Greene, Benjamin and Novak",http://www.kent.net/,Romania,Centralized leadingedge moratorium,2012,Museums / Institutions,4941
+27,4B217cC5a0674C5,"Dickson, Richmond and Clay",http://everett.com/,Czech Republic,Team-oriented tangible complexity,1980,Real Estate / Mortgage,3122
+28,88b1f1cDcf59a37,Prince-David,http://thompson.com/,Christmas Island,Virtual holistic methodology,1970,Banking / Mortgage,1046
+29,f9F7bBCAEeC360F,Ayala LLC,http://www.zhang.com/,Philippines,Open-source zero administration hierarchy,2021,Legal Services,7664
+30,7Cb3AeFcE4Ba31e,Rivas Group,https://hebert.org/,Australia,Open-architected well-modulated capacity,1998,Logistics / Procurement,4155
+31,ccBcC32adcbc530,"Sloan, Mays and Whitehead",http://lawson.com/,Chad,Face-to-face high-level conglomeration,1997,Civil Engineering,365
+32,f5afd686b3d05F5,"Durham, Allen and Barnes",http://chan-stafford.org/,Zimbabwe,Synergistic web-enabled framework,1993,Mechanical or Industrial Engineering,6135
+33,38C6cfC5074Fa5e,Fritz-Franklin,http://www.lambert.com/,Nepal,Automated 4thgeneration website,1972,Hospitality,4516
+34,5Cd7efccCcba38f,Burch-Ewing,http://cline.net/,Taiwan,User-centric 4thgeneration system engine,1981,Venture Capital / VC,7443
+35,9E6Acb51e3F9d6F,"Glass, Barrera and Turner",https://dunlap.com/,Kyrgyz Republic,Multi-channeled 3rdgeneration open system,2020,Utilities,2610
+36,4D4d7E18321eaeC,Pineda-Cox,http://aguilar.org/,Bolivia,Fundamental asynchronous capability,2010,Human Resources / HR,1312
+37,485f5d06B938F2b,"Baker, Mccann and Macdonald",http://www.anderson-barker.com/,Kenya,Cross-group user-facing focus group,2013,Legislative Office,1638
+38,19E3a5Bf6dBDc4F,Cuevas-Moss,https://dodson-castaneda.net/,Guatemala,Extended human-resource intranet,1994,Music,9995
+39,6883A965c7b68F7,Hahn PLC,http://newman.com/,Belarus,Organic logistical leverage,2012,Electrical / Electronic Manufacturing,3715
+40,AC5B7AA74Aa4A2E,"Valentine, Ferguson and Kramer",http://stuart.net/,Jersey,Centralized secondary time-frame,1997,Non - Profit / Volunteering,3585
+41,decab0D5027CA6a,Arroyo Inc,https://www.turner.com/,Grenada,Managed demand-driven website,2006,Writing / Editing,9067
+42,dF084FbBb613eea,Walls LLC,http://www.reese-vasquez.biz/,Cape Verde,Self-enabling fresh-thinking installation,1989,Investment Management / Hedge Fund / Private Equity,1678
+43,A2D89Ab9bCcAd4e,"Mitchell, Warren and Schneider",https://fox.biz/,Trinidad and Tobago,Enhanced intangible time-frame,2021,Capital Markets / Hedge Fund / Private Equity,3816
+44,77aDc905434a49f,Prince PLC,https://www.watts.com/,Sweden,Profit-focused coherent installation,2016,Individual / Family Services,7645
+45,235fdEFE2cfDa5F,Brock-Blackwell,http://www.small.com/,Benin,Secured foreground emulation,1986,Online Publishing,7034
+46,1eD64cFe986BBbE,Walton-Barnett,https://ashley-schaefer.com/,Western Sahara,Right-sized clear-thinking flexibility,2001,Luxury Goods / Jewelry,1746
+47,CbBbFcdd0eaE2cF,Bartlett-Arroyo,https://cruz.com/,Northern Mariana Islands,Realigned didactic function,1976,Civic / Social Organization,3987
+48,49aECbDaE6aBD53,"Wallace, Madden and Morris",http://www.blevins-fernandez.biz/,Germany,Persistent real-time customer loyalty,2016,Pharmaceuticals,9443
+49,7b3fe6e7E72bFa4,Berg-Sparks,https://cisneros-love.com/,Canada,Stand-alone static implementation,1974,Arts / Crafts,2073
+50,c6DedA82A8aef7E,Gonzales Ltd,http://bird.com/,Tonga,Managed human-resource policy,1988,Consumer Goods,9069
+51,7D9FBF85cdC3871,Lawson and Sons,https://www.wong.com/,French Southern Territories,Compatible analyzing intranet,2021,Arts / Crafts,3527
+52,7dd18Fb7cB07b65,"Mcguire, Mcconnell and Olsen",https://melton-briggs.com/,Korea,Profound client-server frame,1988,Printing,8445
+53,EF5B55FadccB8Fe,Charles-Phillips,https://bowman.com/,Cote d'Ivoire,Monitored client-server implementation,2012,Mental Health Care,3450
+54,f8D4B99e11fAF5D,Odom Ltd,https://www.humphrey-hess.com/,Cote d'Ivoire,Advanced static process improvement,2012,Management Consulting,1825
+55,e24D21BFd3bF1E5,Richard PLC,https://holden-coleman.net/,Mayotte,Object-based optimizing model,1971,Broadcast Media,4942
+56,B9BdfEB6D3Ca44E,Sampson Ltd,https://blevins.com/,Cayman Islands,Intuitive local adapter,2005,Farming,1418
+57,2a74D6f3D3B268e,"Cherry, Le and Callahan",https://waller-delacruz.biz/,Nigeria,Universal human-resource collaboration,2017,Entertainment / Movie Production,7202
+58,Bf3F3f62c8aBC33,Cherry PLC,https://www.avila.info/,Marshall Islands,Persistent tertiary website,1980,Plastics,8245
+59,aeBe26B80a7a23c,Melton-Nichols,https://kennedy.com/,Palau,User-friendly clear-thinking productivity,2021,Legislative Office,8741
+60,aAeb29ad43886C6,Potter-Walsh,http://thomas-french.org/,Turkey,Optional non-volatile open system,2008,Human Resources / HR,6923
+61,bD1bc6bB6d1FeD3,Freeman-Chen,https://mathis.com/,Timor-Leste,Phased next generation adapter,1973,International Trade / Development,346
+62,EB9f456e8b7022a,Soto Group,https://norris.info/,Vietnam,Enterprise-wide executive installation,1988,Business Supplies / Equipment,9097
+63,Dfef38C51D8DAe3,"Poole, Cruz and Whitney",https://reed.info/,Reunion,Balanced analyzing groupware,1978,Marketing / Advertising / Sales,2992
+64,055ffEfB2Dd95B0,Riley Ltd,http://wiley.com/,Brazil,Optional exuding superstructure,1986,Textiles,9315
+65,cBfe4dbAE1699da,"Erickson, Andrews and Bailey",https://www.hobbs-grant.com/,Eritrea,Vision-oriented secondary project,2014,Consumer Electronics,7829
+66,fdFbecbadcdCdf1,"Wilkinson, Charles and Arroyo",http://hunter-mcfarland.com/,United States Virgin Islands,Assimilated 24/7 archive,1996,Building Materials,602
+67,5DCb8A5a5ca03c0,Floyd Ltd,http://www.whitney.com/,Falkland Islands (Malvinas),Function-based fault-tolerant concept,2017,Public Relations / PR,2911
+68,ce57DCbcFD6d618,Newman-Galloway,https://www.scott.com/,Luxembourg,Enhanced foreground collaboration,1987,Information Technology / IT,3934
+69,5aaD187dc929371,Frazier-Butler,https://www.daugherty-farley.info/,Northern Mariana Islands,Persistent interactive circuit,1972,Outsourcing / Offshoring,5130
+70,902D7Ac8b6d476b,Newton Inc,https://www.richmond-manning.info/,Netherlands Antilles,Fundamental stable info-mediaries,1976,Military Industry,563
+71,32BB9Ff4d939788,Duffy-Levy,https://www.potter.com/,Guernsey,Diverse exuding installation,1982,Wireless,6146
+72,adcB0afbE58bAe3,Wagner LLC,https://decker-esparza.com/,Uruguay,Reactive attitude-oriented toolset,1987,International Affairs,6874
+73,dfcA1c84AdB61Ac,Mccall-Holmes,http://www.dean.com/,Benin,Object-based value-added database,2009,Legal Services,696
+74,208044AC2fe52F3,Massey LLC,https://frazier.biz/,Suriname,Configurable zero administration Graphical User Interface,1986,Accounting,5004
+75,f3C365f0c1A0623,Hicks LLC,http://alvarez.biz/,Pakistan,Quality-focused client-server Graphical User Interface,1970,Computer Software / Engineering,8480
+76,ec5Bdd3CBAfaB93,"Cole, Russell and Avery",http://www.blankenship.com/,Mongolia,De-engineered fault-tolerant challenge,2000,Law Enforcement,7012
+77,DDB19Be7eeB56B4,Cummings-Rojas,https://simon-pearson.com/,Svalbard & Jan Mayen Islands,User-centric modular customer loyalty,2012,Financial Services,7529
+78,dd6CA3d0bc3cAfc,"Beasley, Greene and Mahoney",http://www.petersen-lawrence.com/,Togo,Extended content-based methodology,1976,Religious Institutions,869
+79,A0B9d56e61070e3,"Beasley, Sims and Allison",http://burke.info/,Latvia,Secured zero tolerance hub,1972,Facilities Services,6182
+80,cBa7EFe5D05Adaf,Crawford-Rivera,https://black-ramirez.org/,Cuba,Persevering exuding budgetary management,1999,Online Publishing,7805
+81,Ea3f6D52Ec73563,Montes-Hensley,https://krueger.org/,Liechtenstein,Multi-tiered secondary productivity,2009,Printing,8433
+82,bC0CEd48A8000E0,Velazquez-Odom,https://stokes.com/,Djibouti,Streamlined 6thgeneration function,2002,Alternative Dispute Resolution,4044
+83,c89b9b59BC4baa1,Eaton-Morales,https://www.reeves-graham.com/,Micronesia,Customer-focused explicit frame,1990,Capital Markets / Hedge Fund / Private Equity,7013
+84,FEC51bce8421a7b,"Roberson, Pennington and Palmer",http://www.keith-fisher.com/,Cameroon,Adaptive bi-directional hierarchy,1993,Telecommunications,5571
+85,e0E8e27eAc9CAd5,"George, Russo and Guerra",https://drake.com/,Sweden,Centralized non-volatile capability,1989,Military Industry,2880
+86,B97a6CF9bf5983C,Davila Inc,https://mcconnell.info/,Cocos (Keeling) Islands,Profit-focused dedicated frame,2017,Consumer Electronics,2215
+87,a0a6f9b3DbcBEb5,Mays-Preston,http://www.browning-key.com/,Mali,User-centric heuristic focus group,2006,Military Industry,5786
+88,8cC1bDa330a5871,Pineda-Morton,https://www.carr.com/,United States Virgin Islands,Grass-roots methodical info-mediaries,1991,Printing,6168
+89,ED889CB2FE9cbd3,Huang and Sons,https://www.bolton.com/,Eritrea,Re-contextualized dynamic hierarchy,1981,Semiconductors,7484
+90,F4Dc1417BC6cb8f,Gilbert-Simon,https://www.bradford.biz/,Burundi,Grass-roots radical parallelism,1973,Newspapers / Journalism,1927
+91,7ABc3c7ecA03B34,Sampson-Griffith,http://hendricks.org/,Benin,Multi-layered composite paradigm,1972,Textiles,3881
+92,4e0719FBE38e0aB,Miles-Dominguez,http://www.turner.com/,Gibraltar,Organized empowering forecast,1996,Civic / Social Organization,897
+93,dEbDAAeDfaed00A,Rowe and Sons,https://www.simpson.org/,El Salvador,Balanced multimedia knowledgebase,1978,Facilities Services,8172
+94,61BDeCfeFD0cEF5,"Valenzuela, Holmes and Rowland",https://www.dorsey.net/,Taiwan,Persistent tertiary focus group,1999,Transportation,1483
+95,4e91eD25f486110,"Best, Wade and Shepard",https://zimmerman.com/,Zimbabwe,Innovative background definition,1991,Gambling / Casinos,4873
+96,0a0bfFbBbB8eC7c,Holmes Group,https://mcdowell.org/,Ethiopia,Right-sized zero tolerance focus group,1975,Photography,2988
+97,BA6Cd9Dae2Efd62,Good Ltd,http://duffy.com/,Anguilla,Reverse-engineered composite moratorium,1971,Consumer Services,4292
+98,E7df80C60Abd7f9,Clements-Espinoza,http://www.flowers.net/,Falkland Islands (Malvinas),Progressive modular hub,1991,Broadcast Media,236
+99,AFc285dbE2fEd24,Mendez Inc,https://www.burke.net/,Kyrgyz Republic,User-friendly exuding migration,1993,Education Management,339
+100,e9eB5A60Cef8354,Watkins-Kaiser,http://www.herring.com/,Togo,Synergistic background access,2009,Financial Services,2785
\ No newline at end of file
diff --git a/applications/ColossalQA/data/data_sample/custom_service.json b/applications/ColossalQA/data/data_sample/custom_service.json
new file mode 100644
index 000000000000..33dbdf705707
--- /dev/null
+++ b/applications/ColossalQA/data/data_sample/custom_service.json
@@ -0,0 +1 @@
+{"features":[{"feature_idx":0,"name":"query","type":{"dtype":"string","_type":"Value"}},{"feature_idx":1,"name":"response","type":{"dtype":"string","_type":"Value"}}],"rows":[{"row_idx":0,"row":{"query":"My order hasn't arrived yet.","response":"We apologize for the inconvenience. Can you please provide your order number so we can investigate?"},"truncated_cells":[]},{"row_idx":1,"row":{"query":"I received a damaged product.","response":"We apologize for the inconvenience. Can you please provide a photo of the damaged product so we can assist you further?"},"truncated_cells":[]},{"row_idx":2,"row":{"query":"I need to return an item.","response":"Certainly. Please provide your order number and reason for return, and we will provide you with instructions on how to proceed."},"truncated_cells":[]},{"row_idx":3,"row":{"query":"I want to change my shipping address.","response":"No problem. Can you please provide your order number and the new shipping address you'd like to use?"},"truncated_cells":[]},{"row_idx":4,"row":{"query":"I have a question about my bill.","response":"We'd be happy to help. Can you please provide your account number and a brief description of your question?"},"truncated_cells":[]},{"row_idx":5,"row":{"query":"How do I cancel my subscription?","response":"We're sorry to hear that. Can you please provide your account email and the name of the subscription you'd like to cancel?"},"truncated_cells":[]},{"row_idx":6,"row":{"query":"Can I get a refund for my purchase?","response":"We understand your concern. Please provide your order number and the reason for your refund request so we can assist you further."},"truncated_cells":[]},{"row_idx":7,"row":{"query":"I'd like to track my order.","response":"Of course. Can you please provide your order number so we can check the current status for you?"},"truncated_cells":[]},{"row_idx":8,"row":{"query":"My account has been locked.","response":"We apologize for the inconvenience. Can you please provide your account email so we can help you regain access?"},"truncated_cells":[]},{"row_idx":9,"row":{"query":"I can't find the item I'm looking for.","response":"We're here to help. Can you please provide a description or product name of the item you're looking for so we can assist you?"},"truncated_cells":[]},{"row_idx":10,"row":{"query":"I'm having trouble applying a promo code.","response":"We apologize for the inconvenience. Can you please provide the promo code you're trying to use, and we'll help you resolve the issue?"},"truncated_cells":[]},{"row_idx":11,"row":{"query":"How do I update my payment information?","response":"We can help with that. Can you please provide your account email so we can guide you through updating your payment information?"},"truncated_cells":[]},{"row_idx":12,"row":{"query":"I haven't received a response to my email inquiry.","response":"We apologize for the delay. Can you please provide your ticket number or account email so we can follow up on your inquiry?"},"truncated_cells":[]},{"row_idx":13,"row":{"query":"Can I change the size of an item in my order?","response":"Certainly. Can you please provide your order number and the details of the item you'd like to change?"},"truncated_cells":[]},{"row_idx":14,"row":{"query":"What is the status of my warranty claim?","response":"We'd be happy to check for you. Can you please provide your claim number or the product's serial number?"},"truncated_cells":[]},{"row_idx":15,"row":{"query":"Is my item in stock?","response":"To check availability, can you please provide the product name or SKU so we can assist you?"},"truncated_cells":[]},{"row_idx":16,"row":{"query":"My promo code has expired. Can I still use it?","response":"Unfortunately, expired promo codes cannot be used. However, please provide the code, and we'll see if there's a similar offer currently available."},"truncated_cells":[]},{"row_idx":17,"row":{"query":"How do I reset my password?","response":"We can help with that. Please provide your account email, and we'll send you instructions on how to reset your password."},"truncated_cells":[]},{"row_idx":18,"row":{"query":"Can I place a bulk order?","response":"Certainly. Can you please provide the product name or SKU, and the quantity you'd like to order so we can check availability and pricing?"},"truncated_cells":[]},{"row_idx":19,"row":{"query":"How do I apply for a job at your company?","response":"Thank you for your interest. Can you please provide your email address so we can send you information on how to apply?"},"truncated_cells":[]},{"row_idx":20,"row":{"query":"Can I add an item to an existing order?","response":"We'll do our best to help. Can you please provide your order number and the details of the item you'd like to add?"},"truncated_cells":[]},{"row_idx":21,"row":{"query":"How long does shipping take?","response":"We'd be happy to provide an estimate. Can you please provide your shipping destination and the product name or SKU?"},"truncated_cells":[]},{"row_idx":22,"row":{"query":"I was charged twice for my order.","response":"We apologize for the inconvenience. Can you please provide your order number so we can investigate and resolve the issue?"},"truncated_cells":[]},{"row_idx":23,"row":{"query":"Where can I find your sizing chart?","response":"We'd be happy to help. Can you please provide the product name or SKU so we can direct you to the appropriate sizing chart?"},"truncated_cells":[]},{"row_idx":24,"row":{"query":"I need technical support for a product.","response":"We're here to help. Can you please provide the product name, SKU, or serial number, and a description of the issue you're experiencing?"},"truncated_cells":[]},{"row_idx":25,"row":{"query":"Can I exchange an item for a different color?","response":"Certainly. Can you please provide your order number and the details of the item you'd like to exchange?"},"truncated_cells":[]},{"row_idx":26,"row":{"query":"How do I apply for a store credit card?","response":"Thank you for your interest. Can you please provide your email address so we can send you information on how to apply?"},"truncated_cells":[]},{"row_idx":27,"row":{"query":"Where is your store located?","response":"We'd be happy to help you find a store near you. Can you please provide your city and state or zip code?"},"truncated_cells":[]},{"row_idx":28,"row":{"query":"I can't log into my account.","response":"We apologize for the inconvenience. Can you please provide your account email so we can help you troubleshoot the issue?"},"truncated_cells":[]},{"row_idx":29,"row":{"query":"Do you offer gift wrapping?","response":"We do offer gift wrapping for select items. Can you please provide the product name or SKU so we can confirm if gift wrapping is available?"},"truncated_cells":[]},{"row_idx":30,"row":{"query":"Can I schedule a delivery?","response":"Certainly. Can you please provide your order number and the preferred date and time for delivery?"},"truncated_cells":[]},{"row_idx":31,"row":{"query":"How do I unsubscribe from your newsletter?","response":"We're sorry to see you go. Can you please provide your email address so we can remove you from our mailing list?"},"truncated_cells":[]},{"row_idx":32,"row":{"query":"What is your return policy?","response":"We'd be happy to provide information on our return policy. Can you please provide the product name or SKU so we can give you the most accurate information?"},"truncated_cells":[]},{"row_idx":33,"row":{"query":"Do you offer price matching?","response":"We do offer price matching in certain cases. Can you please provide the product name or SKU and the competitor's pricing information?"},"truncated_cells":[]},{"row_idx":34,"row":{"query":"Are there any current promotions or sales?","response":"We'd be happy to inform you of any current promotions. Can you please provide the product name or SKU you're interested in, or the type of promotion you're looking for?"},"truncated_cells":[]},{"row_idx":35,"row":{"query":"My coupon isn't working.","response":"We apologize for the inconvenience. Can you please provide the coupon code and the product name or SKU you're trying to apply it to?"},"truncated_cells":[]},{"row_idx":36,"row":{"query":"Can you help me with a product recommendation?","response":"Of course! Can you please provide some details about what you're looking for, such as product type, features, or price range?"},"truncated_cells":[]},{"row_idx":37,"row":{"query":"Do you offer international shipping?","response":"We'd be happy to help. Can you please provide your shipping destination and the product name or SKU you're interested in?"},"truncated_cells":[]},{"row_idx":38,"row":{"query":"Can I place an order over the phone?","response":"Certainly. Can you please provide your phone number and the best time to reach you, and one of our representatives will contact you to place the order?"},"truncated_cells":[]},{"row_idx":39,"row":{"query":"How do I know if my order went through?","response":"We can help you with that. Can you please provide your account email or order number so we can check the status for you?"},"truncated_cells":[]},{"row_idx":40,"row":{"query":"Can I use multiple promo codes on one order?","response":"In most cases, only one promo code can be applied per order. Can you please provide the promo codes you're trying to use so we can check their compatibility?"},"truncated_cells":[]},{"row_idx":41,"row":{"query":"What are your customer service hours?","response":"Our customer service team is available 24/7 to assist you. Please feel free to reach out with any questions or concerns."},"truncated_cells":[]},{"row_idx":42,"row":{"query":"Where can I find the user manual for my product?","response":"We'd be happy to help. Can you please provide the product name, SKU, or serial number so we can direct you to the appropriate user manual?"},"truncated_cells":[]},{"row_idx":43,"row":{"query":"Do you offer a warranty on your products?","response":"We do offer warranties on select products. Can you please provide the product name or SKU so we can provide you with warranty information?"},"truncated_cells":[]},{"row_idx":44,"row":{"query":"Can I place an order for in-store pickup?","response":"Certainly. Can you please provide the product name or SKU and the store location where you'd like to pick up your order?"},"truncated_cells":[]},{"row_idx":45,"row":{"query":"How do I sign up for your rewards program?","response":"Thank you for your interest in our rewards program. Can you please provide your email address so we can send you information on how to sign up?"},"truncated_cells":[]},{"row_idx":46,"row":{"query":"Can I pay with a gift card online?","response":"Yes, you can use gift cards for online purchases. Can you please provide the gift card number and the product name or SKU you're interested in?"},"truncated_cells":[]},{"row_idx":47,"row":{"query":"I can't find my order confirmation email.","response":"We apologize for the inconvenience. Can you please provide your account email or order number so we can resend the confirmation email?"},"truncated_cells":[]},{"row_idx":48,"row":{"query":"Do you offer a military discount?","response":"Yes, we do offer a military discount. Can you please provide your military ID or email address so we can apply the discount to your account?"},"truncated_cells":[]},{"row_idx":49,"row":{"query":"What is the processing time for my order?","response":"We'd be happy to provide an estimate. Can you please provide your order number or the product name or SKU?"},"truncated_cells":[]},{"row_idx":50,"row":{"query":"How do I update my shipping preferences?","response":"We can help with that. Can you please provide your account email so we can guide you through updating your shipping preferences?"},"truncated_cells":[]},{"row_idx":51,"row":{"query":"Is my payment information secure?","response":"Yes, we take security very seriously. Can you please provide your account email so we can verify the security measures in place for your payment information?"},"truncated_cells":[]},{"row_idx":52,"row":{"query":"Can I pre-order an item?","response":"Certainly. Can you please provide the product name or SKU and your email address so we can notify you when pre-orders are available?"},"truncated_cells":[]},{"row_idx":53,"row":{"query":"How do I use a gift card in-store?","response":"To use a gift card in-store, simply present the gift card at the time of purchase. Can you please provide the gift card number so we can check the balance for you?"},"truncated_cells":[]},{"row_idx":54,"row":{"query":"Do you have a loyalty program?","response":"Yes, we do have a loyalty program. Can you please provide your email address so we can send you information on how to join and enjoy the benefits?"},"truncated_cells":[]},{"row_idx":55,"row":{"query":"Is there a mobile app for your store?","response":"Yes, we do have a mobile app. Can you please provide your email address so we can send you a link to download the app and instructions on how to use it?"},"truncated_cells":[]},{"row_idx":56,"row":{"query":"I need help assembling my product.","response":"We're here to help. Can you please provide the product name, SKU, or serial number, and a description of the issue you're experiencing during assembly?"},"truncated_cells":[]},{"row_idx":57,"row":{"query":"Do you offer financing options?","response":"We do offer financing options for select purchases. Can you please provide the product name or SKU and your email address so we can send you more information?"},"truncated_cells":[]},{"row_idx":58,"row":{"query":"Can I reserve an item in-store?","response":"Certainly. Can you please provide the product name or SKU and the store location where you'd like to reserve the item?"},"truncated_cells":[]},{"row_idx":59,"row":{"query":"How do I get a price adjustment for a recent purchase?","response":"We'd be happy to help. Can you please provide your order number and the product name or SKU for the item you'd like a price adjustment on?"},"truncated_cells":[]},{"row_idx":60,"row":{"query":"How do I change my email preferences?","response":"We can help with that. Can you please provide your account email so we can guide you through updating your email preferences?"},"truncated_cells":[]},{"row_idx":61,"row":{"query":"Can I use my store credit online?","response":"Yes, you can use store credit for online purchases. Can you please provide the store credit number and the product name or SKU you're interested in?"},"truncated_cells":[]},{"row_idx":62,"row":{"query":"What are the washing instructions for this item?","response":"We'd be happy to help. Can you please provide the product name or SKU so we can provide you with the proper washing instructions?"},"truncated_cells":[]},{"row_idx":63,"row":{"query":"Can I get a replacement part for my product?","response":"Certainly. Can you please provide the product name, SKU, or serial number, and a description of the part you need?"},"truncated_cells":[]},{"row_idx":64,"row":{"query":"Do you offer free shipping?","response":"We do offer free shipping on select orders. Can you please provide the product name or SKU and your shipping destination so we can check if your order qualifies?"},"truncated_cells":[]},{"row_idx":65,"row":{"query":"Can I place a custom order?","response":"We'd be happy to assist you. Can you please provide the product name or SKU and a description of the customizations you'd like?"},"truncated_cells":[]},{"row_idx":66,"row":{"query":"How do I report a problem with your website?","response":"We appreciate your feedback. Can you please provide a description of the issue you're experiencing and your email address so we can follow up with you?"},"truncated_cells":[]},{"row_idx":67,"row":{"query":"What is your policy on price adjustments?","response":"We'd be happy to provide information on our price adjustment policy. Can you please provide the product name or SKU so we can give you the most accurate information?"},"truncated_cells":[]},{"row_idx":68,"row":{"query":"Do you have any upcoming sales or events?","response":"We'd be happy to inform you of any upcoming sales or events. Can you please provide your email address so we can keep you updated?"},"truncated_cells":[]},{"row_idx":69,"row":{"query":"How do I schedule a consultation or appointment?","response":"We'd be happy to help. Can you please provide your name, phone number, and the service you're interested in so we can schedule your appointment?"},"truncated_cells":[]},{"row_idx":70,"row":{"query":"Can I get a copy of my receipt?","response":"Certainly. Can you please provide your order number or account email so we can locate your receipt and send you a copy?"},"truncated_cells":[]},{"row_idx":71,"row":{"query":"Can I use a competitor's coupon at your store?","response":"In some cases, we may accept competitor coupons. Can you please provide the competitor's coupon code and the product name or SKU you'd like to apply it to?"},"truncated_cells":[]},{"row_idx":72,"row":{"query":"Do you have a recycling program?","response":"Yes, we do have a recycling program. Can you please provide your email address so we can send you information on how to participate?"},"truncated_cells":[]},{"row_idx":73,"row":{"query":"How do I report a lost or stolen gift card?","response":"We're sorry to hear that. Can you please provide the gift card number, if available, and your email address so we can assist you further?"},"truncated_cells":[]}],"num_rows_total":74,"num_rows_per_page":100}
diff --git a/applications/ColossalQA/data/data_sample/custom_service_classification.json b/applications/ColossalQA/data/data_sample/custom_service_classification.json
new file mode 100644
index 000000000000..2d7e4b05d217
--- /dev/null
+++ b/applications/ColossalQA/data/data_sample/custom_service_classification.json
@@ -0,0 +1,64 @@
+{
+ "data": [
+ {
+ "key": "客户反映手机无法接收短信,但可以正常拨打电话,已确认手机号码正常,需要处理。",
+ "value": "故障原因分类: 短信接收问题"
+ },
+ {
+ "key": "客户申请开通国际漫游服务,但在目的地无法使用手机信号,已核实客户所在地国家为不支持漫游的区域,已通知客户。",
+ "value": "故障原因分类: 国际漫游服务"
+ },
+ {
+ "key": "客户称手机信号时强时弱,经过测试发现在不同区域信号确实存在波动,属于正常现象。",
+ "value": "故障原因分类: 信号强弱波动"
+ },
+ {
+ "key": "客户反映在家中无法连接Wi-Fi,建议检查路由器或尝试更换位置。",
+ "value": "故障原因分类: 家庭网络问题"
+ },
+ {
+ "key": "客户申请更换新的SIM卡,因旧卡损坏,已为客户办理新卡。",
+ "value": "故障原因分类: SIM卡更换"
+ },
+ {
+ "key": "客户反映通话时听不清对方声音,经检查发现是手机内置扬声器故障,建议维修。",
+ "value": "故障原因分类: 扬声器故障"
+ },
+ {
+ "key": "客户手机丢失,请求挂失并办理新卡,已为客户挂失旧卡并补办新卡。",
+ "value": "故障原因分类: 挂失与补办"
+ },
+ {
+ "key": "客户反映在市区内无法使用手机信号,经排查发现信号塔维护,属于暂时性故障。",
+ "value": "故障原因分类: 信号塔维护"
+ },
+ {
+ "key": "客户反映手机充电时出现过热情况,建议更换充电器。",
+ "value": "故障原因分类: 充电器故障"
+ },
+ {
+ "key": "客户要求关闭数据漫游功能,已为客户关闭。",
+ "value": "故障原因分类: 关闭数据漫游"
+ },
+ {
+ "key": "客户申请办理家庭套餐业务,已为客户办理。",
+ "value": "故障原因分类: 家庭套餐办理"
+ },
+ {
+ "key": "客户反映在商场内无法使用手机信号,建议检查手机信号设置。",
+ "value": "故障原因分类: 手机信号设置"
+ },
+ {
+ "key": "客户申请开通国际长途业务,已为客户办理。",
+ "value": "故障原因分类: 国际长途业务办理"
+ },
+ {
+ "key": "客户反映手机屏幕出现蓝屏,建议客户前往维修。",
+ "value": "故障原因分类: 手机屏幕故障"
+ },
+ {
+ "key": "客户申请办理免流量业务,已为客户办理。",
+ "value": "故障原因分类: 免流量业务办理"
+ }
+ ]
+}
diff --git a/applications/ColossalQA/data/data_sample/custom_service_preprocessed.json b/applications/ColossalQA/data/data_sample/custom_service_preprocessed.json
new file mode 100644
index 000000000000..f4e095f162af
--- /dev/null
+++ b/applications/ColossalQA/data/data_sample/custom_service_preprocessed.json
@@ -0,0 +1 @@
+{"data": [{"key": "My order hasn't arrived yet.", "value": "We apologize for the inconvenience. Can you please provide your order number so we can investigate?"}, {"key": "I received a damaged product.", "value": "We apologize for the inconvenience. Can you please provide a photo of the damaged product so we can assist you further?"}, {"key": "I need to return an item.", "value": "Certainly. Please provide your order number and reason for return, and we will provide you with instructions on how to proceed."}, {"key": "I want to change my shipping address.", "value": "No problem. Can you please provide your order number and the new shipping address you'd like to use?"}, {"key": "I have a question about my bill.", "value": "We'd be happy to help. Can you please provide your account number and a brief description of your question?"}, {"key": "How do I cancel my subscription?", "value": "We're sorry to hear that. Can you please provide your account email and the name of the subscription you'd like to cancel?"}, {"key": "Can I get a refund for my purchase?", "value": "We understand your concern. Please provide your order number and the reason for your refund request so we can assist you further."}, {"key": "I'd like to track my order.", "value": "Of course. Can you please provide your order number so we can check the current status for you?"}, {"key": "My account has been locked.", "value": "We apologize for the inconvenience. Can you please provide your account email so we can help you regain access?"}, {"key": "I can't find the item I'm looking for.", "value": "We're here to help. Can you please provide a description or product name of the item you're looking for so we can assist you?"}, {"key": "I'm having trouble applying a promo code.", "value": "We apologize for the inconvenience. Can you please provide the promo code you're trying to use, and we'll help you resolve the issue?"}, {"key": "How do I update my payment information?", "value": "We can help with that. Can you please provide your account email so we can guide you through updating your payment information?"}, {"key": "I haven't received a response to my email inquiry.", "value": "We apologize for the delay. Can you please provide your ticket number or account email so we can follow up on your inquiry?"}, {"key": "Can I change the size of an item in my order?", "value": "Certainly. Can you please provide your order number and the details of the item you'd like to change?"}, {"key": "What is the status of my warranty claim?", "value": "We'd be happy to check for you. Can you please provide your claim number or the product's serial number?"}, {"key": "Is my item in stock?", "value": "To check availability, can you please provide the product name or SKU so we can assist you?"}, {"key": "My promo code has expired. Can I still use it?", "value": "Unfortunately, expired promo codes cannot be used. However, please provide the code, and we'll see if there's a similar offer currently available."}, {"key": "How do I reset my password?", "value": "We can help with that. Please provide your account email, and we'll send you instructions on how to reset your password."}, {"key": "Can I place a bulk order?", "value": "Certainly. Can you please provide the product name or SKU, and the quantity you'd like to order so we can check availability and pricing?"}, {"key": "How do I apply for a job at your company?", "value": "Thank you for your interest. Can you please provide your email address so we can send you information on how to apply?"}, {"key": "Can I add an item to an existing order?", "value": "We'll do our best to help. Can you please provide your order number and the details of the item you'd like to add?"}, {"key": "How long does shipping take?", "value": "We'd be happy to provide an estimate. Can you please provide your shipping destination and the product name or SKU?"}, {"key": "I was charged twice for my order.", "value": "We apologize for the inconvenience. Can you please provide your order number so we can investigate and resolve the issue?"}, {"key": "Where can I find your sizing chart?", "value": "We'd be happy to help. Can you please provide the product name or SKU so we can direct you to the appropriate sizing chart?"}, {"key": "I need technical support for a product.", "value": "We're here to help. Can you please provide the product name, SKU, or serial number, and a description of the issue you're experiencing?"}, {"key": "Can I exchange an item for a different color?", "value": "Certainly. Can you please provide your order number and the details of the item you'd like to exchange?"}, {"key": "How do I apply for a store credit card?", "value": "Thank you for your interest. Can you please provide your email address so we can send you information on how to apply?"}, {"key": "Where is your store located?", "value": "We'd be happy to help you find a store near you. Can you please provide your city and state or zip code?"}, {"key": "I can't log into my account.", "value": "We apologize for the inconvenience. Can you please provide your account email so we can help you troubleshoot the issue?"}, {"key": "Do you offer gift wrapping?", "value": "We do offer gift wrapping for select items. Can you please provide the product name or SKU so we can confirm if gift wrapping is available?"}, {"key": "Can I schedule a delivery?", "value": "Certainly. Can you please provide your order number and the preferred date and time for delivery?"}, {"key": "How do I unsubscribe from your newsletter?", "value": "We're sorry to see you go. Can you please provide your email address so we can remove you from our mailing list?"}, {"key": "What is your return policy?", "value": "We'd be happy to provide information on our return policy. Can you please provide the product name or SKU so we can give you the most accurate information?"}, {"key": "Do you offer price matching?", "value": "We do offer price matching in certain cases. Can you please provide the product name or SKU and the competitor's pricing information?"}, {"key": "Are there any current promotions or sales?", "value": "We'd be happy to inform you of any current promotions. Can you please provide the product name or SKU you're interested in, or the type of promotion you're looking for?"}, {"key": "My coupon isn't working.", "value": "We apologize for the inconvenience. Can you please provide the coupon code and the product name or SKU you're trying to apply it to?"}, {"key": "Can you help me with a product recommendation?", "value": "Of course! Can you please provide some details about what you're looking for, such as product type, features, or price range?"}, {"key": "Do you offer international shipping?", "value": "We'd be happy to help. Can you please provide your shipping destination and the product name or SKU you're interested in?"}, {"key": "Can I place an order over the phone?", "value": "Certainly. Can you please provide your phone number and the best time to reach you, and one of our representatives will contact you to place the order?"}, {"key": "How do I know if my order went through?", "value": "We can help you with that. Can you please provide your account email or order number so we can check the status for you?"}, {"key": "Can I use multiple promo codes on one order?", "value": "In most cases, only one promo code can be applied per order. Can you please provide the promo codes you're trying to use so we can check their compatibility?"}, {"key": "What are your customer service hours?", "value": "Our customer service team is available 24/7 to assist you. Please feel free to reach out with any questions or concerns."}, {"key": "Where can I find the user manual for my product?", "value": "We'd be happy to help. Can you please provide the product name, SKU, or serial number so we can direct you to the appropriate user manual?"}, {"key": "Do you offer a warranty on your products?", "value": "We do offer warranties on select products. Can you please provide the product name or SKU so we can provide you with warranty information?"}, {"key": "Can I place an order for in-store pickup?", "value": "Certainly. Can you please provide the product name or SKU and the store location where you'd like to pick up your order?"}, {"key": "How do I sign up for your rewards program?", "value": "Thank you for your interest in our rewards program. Can you please provide your email address so we can send you information on how to sign up?"}, {"key": "Can I pay with a gift card online?", "value": "Yes, you can use gift cards for online purchases. Can you please provide the gift card number and the product name or SKU you're interested in?"}, {"key": "I can't find my order confirmation email.", "value": "We apologize for the inconvenience. Can you please provide your account email or order number so we can resend the confirmation email?"}, {"key": "Do you offer a military discount?", "value": "Yes, we do offer a military discount. Can you please provide your military ID or email address so we can apply the discount to your account?"}, {"key": "What is the processing time for my order?", "value": "We'd be happy to provide an estimate. Can you please provide your order number or the product name or SKU?"}, {"key": "How do I update my shipping preferences?", "value": "We can help with that. Can you please provide your account email so we can guide you through updating your shipping preferences?"}, {"key": "Is my payment information secure?", "value": "Yes, we take security very seriously. Can you please provide your account email so we can verify the security measures in place for your payment information?"}, {"key": "Can I pre-order an item?", "value": "Certainly. Can you please provide the product name or SKU and your email address so we can notify you when pre-orders are available?"}, {"key": "How do I use a gift card in-store?", "value": "To use a gift card in-store, simply present the gift card at the time of purchase. Can you please provide the gift card number so we can check the balance for you?"}, {"key": "Do you have a loyalty program?", "value": "Yes, we do have a loyalty program. Can you please provide your email address so we can send you information on how to join and enjoy the benefits?"}, {"key": "Is there a mobile app for your store?", "value": "Yes, we do have a mobile app. Can you please provide your email address so we can send you a link to download the app and instructions on how to use it?"}, {"key": "I need help assembling my product.", "value": "We're here to help. Can you please provide the product name, SKU, or serial number, and a description of the issue you're experiencing during assembly?"}, {"key": "Do you offer financing options?", "value": "We do offer financing options for select purchases. Can you please provide the product name or SKU and your email address so we can send you more information?"}, {"key": "Can I reserve an item in-store?", "value": "Certainly. Can you please provide the product name or SKU and the store location where you'd like to reserve the item?"}, {"key": "How do I get a price adjustment for a recent purchase?", "value": "We'd be happy to help. Can you please provide your order number and the product name or SKU for the item you'd like a price adjustment on?"}, {"key": "How do I change my email preferences?", "value": "We can help with that. Can you please provide your account email so we can guide you through updating your email preferences?"}, {"key": "Can I use my store credit online?", "value": "Yes, you can use store credit for online purchases. Can you please provide the store credit number and the product name or SKU you're interested in?"}, {"key": "What are the washing instructions for this item?", "value": "We'd be happy to help. Can you please provide the product name or SKU so we can provide you with the proper washing instructions?"}, {"key": "Can I get a replacement part for my product?", "value": "Certainly. Can you please provide the product name, SKU, or serial number, and a description of the part you need?"}, {"key": "Do you offer free shipping?", "value": "We do offer free shipping on select orders. Can you please provide the product name or SKU and your shipping destination so we can check if your order qualifies?"}, {"key": "Can I place a custom order?", "value": "We'd be happy to assist you. Can you please provide the product name or SKU and a description of the customizations you'd like?"}, {"key": "How do I report a problem with your website?", "value": "We appreciate your feedback. Can you please provide a description of the issue you're experiencing and your email address so we can follow up with you?"}, {"key": "What is your policy on price adjustments?", "value": "We'd be happy to provide information on our price adjustment policy. Can you please provide the product name or SKU so we can give you the most accurate information?"}, {"key": "Do you have any upcoming sales or events?", "value": "We'd be happy to inform you of any upcoming sales or events. Can you please provide your email address so we can keep you updated?"}, {"key": "How do I schedule a consultation or appointment?", "value": "We'd be happy to help. Can you please provide your name, phone number, and the service you're interested in so we can schedule your appointment?"}, {"key": "Can I get a copy of my receipt?", "value": "Certainly. Can you please provide your order number or account email so we can locate your receipt and send you a copy?"}, {"key": "Can I use a competitor's coupon at your store?", "value": "In some cases, we may accept competitor coupons. Can you please provide the competitor's coupon code and the product name or SKU you'd like to apply it to?"}, {"key": "Do you have a recycling program?", "value": "Yes, we do have a recycling program. Can you please provide your email address so we can send you information on how to participate?"}, {"key": "How do I report a lost or stolen gift card?", "value": "We're sorry to hear that. Can you please provide the gift card number, if available, and your email address so we can assist you further?"}]}
diff --git a/applications/ColossalQA/data/data_sample/luchen_zh.txt b/applications/ColossalQA/data/data_sample/luchen_zh.txt
new file mode 100644
index 000000000000..afd7fc306fad
--- /dev/null
+++ b/applications/ColossalQA/data/data_sample/luchen_zh.txt
@@ -0,0 +1 @@
+潞晨科技是一家致力于“解放AI生产力”的全球性公司,技术团队核心成员来自美国加州伯克利、斯坦福、新加坡国立、南洋理工、清华、北大等国内外知名高校。在高性能计算、人工智能、分布式系统等方面已有十余年的技术积累,并在国际顶级学术刊物或会议发表论文近百篇。公司核心产品面向大模型时代的通用深度学习系统 Colossal-AI,可实现高效快速部署AI大模型训练和推理,降低AI大模型应用成本。公司在种子轮、天使轮融资已获得“清科中国早期投资机构30强”前三甲创新工场、真格基金、蓝驰创投的600万美元投资。
diff --git a/applications/ColossalQA/data/tests/64KB.json b/applications/ColossalQA/data/tests/64KB.json
new file mode 100644
index 000000000000..99278dc5c79a
--- /dev/null
+++ b/applications/ColossalQA/data/tests/64KB.json
@@ -0,0 +1,7 @@
+{
+ "data":[
+ {"content":"Donec lobortis eleifend condimentum. Cras dictum dolor lacinia lectus vehicula rutrum. Maecenas quis nisi nunc. Nam tristique feugiat est vitae mollis. Maecenas quis nisi nunc."},
+ {"content":"Aliquam sollicitudin ante ligula, eget malesuada nibh efficitur et. Pellentesque massa sem, scelerisque sit amet odio id, cursus tempor urna. Etiam congue dignissim volutpat. Vestibulum pharetra libero et velit gravida euismod."}
+ ],
+ "name":"player"
+}
\ No newline at end of file
diff --git a/applications/ColossalQA/data/tests/companies.csv b/applications/ColossalQA/data/tests/companies.csv
new file mode 100644
index 000000000000..93dcac9f39ae
--- /dev/null
+++ b/applications/ColossalQA/data/tests/companies.csv
@@ -0,0 +1,101 @@
+Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
+1,FAB0d41d5b5d22c,Ferrell LLC,https://price.net/,Papua New Guinea,Horizontal empowering knowledgebase,1990,Plastics,3498
+2,6A7EdDEA9FaDC52,"Mckinney, Riley and Day",http://www.hall-buchanan.info/,Finland,User-centric system-worthy leverage,2015,Glass / Ceramics / Concrete,4952
+3,0bFED1ADAE4bcC1,Hester Ltd,http://sullivan-reed.com/,China,Switchable scalable moratorium,1971,Public Safety,5287
+4,2bFC1Be8a4ce42f,Holder-Sellers,https://becker.com/,Turkmenistan,De-engineered systemic artificial intelligence,2004,Automotive,921
+5,9eE8A6a4Eb96C24,Mayer Group,http://www.brewer.com/,Mauritius,Synchronized needs-based challenge,1991,Transportation,7870
+6,cC757116fe1C085,Henry-Thompson,http://morse.net/,Bahamas,Face-to-face well-modulated customer loyalty,1992,Primary / Secondary Education,4914
+7,219233e8aFF1BC3,Hansen-Everett,https://www.kidd.org/,Pakistan,Seamless disintermediate collaboration,2018,Publishing Industry,7832
+8,ccc93DCF81a31CD,Mcintosh-Mora,https://www.brooks.com/,Heard Island and McDonald Islands,Centralized attitude-oriented capability,1970,Import / Export,4389
+9,0B4F93aA06ED03e,Carr Inc,http://ross.com/,Kuwait,Distributed impactful customer loyalty,1996,Plastics,8167
+10,738b5aDe6B1C6A5,Gaines Inc,http://sandoval-hooper.com/,Uzbekistan,Multi-lateral scalable protocol,1997,Outsourcing / Offshoring,9698
+11,AE61b8Ffebbc476,Kidd Group,http://www.lyons.com/,Bouvet Island (Bouvetoya),Proactive foreground paradigm,2001,Primary / Secondary Education,7473
+12,eb3B7D06cCdD609,Crane-Clarke,https://www.sandoval.com/,Denmark,Front-line clear-thinking encryption,2014,Food / Beverages,9011
+13,8D0c29189C9798B,"Keller, Campos and Black",https://www.garner.info/,Liberia,Ameliorated directional emulation,2020,Museums / Institutions,2862
+14,D2c91cc03CA394c,Glover-Pope,http://www.silva.biz/,United Arab Emirates,Persevering contextually-based approach,2013,Medical Practice,9079
+15,C8AC1eaf9C036F4,Pacheco-Spears,https://aguilar.com/,Sweden,Secured logistical synergy,1984,Maritime,769
+16,b5D10A14f7a8AfE,Hodge-Ayers,http://www.archer-elliott.com/,Honduras,Future-proofed radical implementation,1990,Facilities Services,8508
+17,68139b5C4De03B4,"Bowers, Guerra and Krause",http://www.carrillo-nicholson.com/,Uganda,De-engineered transitional strategy,1972,Primary / Secondary Education,6986
+18,5c2EffEfdba2BdF,Mckenzie-Melton,http://montoya-thompson.com/,Hong Kong,Reverse-engineered heuristic alliance,1998,Investment Management / Hedge Fund / Private Equity,4589
+19,ba179F19F7925f5,Branch-Mann,http://www.lozano.com/,Botswana,Adaptive intangible frame,1999,Architecture / Planning,7961
+20,c1Ce9B350BAc66b,Weiss and Sons,https://barrett.com/,Korea,Sharable optimal functionalities,2011,Plastics,5984
+21,8de40AC4e6EaCa4,"Velez, Payne and Coffey",http://burton.com/,Luxembourg,Mandatory coherent synergy,1986,Wholesale,5010
+22,Aad86a4F0385F2d,Harrell LLC,http://www.frey-rosario.com/,Guadeloupe,Reverse-engineered mission-critical moratorium,2018,Construction,2185
+23,22aC3FFd64fD703,"Eaton, Reynolds and Vargas",http://www.freeman.biz/,Monaco,Self-enabling multi-tasking process improvement,2014,Luxury Goods / Jewelry,8987
+24,5Ec4C272bCf085c,Robbins-Cummings,http://donaldson-wilkins.com/,Belgium,Organic non-volatile hierarchy,1991,Pharmaceuticals,5038
+25,5fDBeA8BB91a000,Jenkins Inc,http://www.kirk.biz/,South Africa,Front-line systematic help-desk,2002,Insurance,1215
+26,dFfD6a6F9AC2d9C,"Greene, Benjamin and Novak",http://www.kent.net/,Romania,Centralized leadingedge moratorium,2012,Museums / Institutions,4941
+27,4B217cC5a0674C5,"Dickson, Richmond and Clay",http://everett.com/,Czech Republic,Team-oriented tangible complexity,1980,Real Estate / Mortgage,3122
+28,88b1f1cDcf59a37,Prince-David,http://thompson.com/,Christmas Island,Virtual holistic methodology,1970,Banking / Mortgage,1046
+29,f9F7bBCAEeC360F,Ayala LLC,http://www.zhang.com/,Philippines,Open-source zero administration hierarchy,2021,Legal Services,7664
+30,7Cb3AeFcE4Ba31e,Rivas Group,https://hebert.org/,Australia,Open-architected well-modulated capacity,1998,Logistics / Procurement,4155
+31,ccBcC32adcbc530,"Sloan, Mays and Whitehead",http://lawson.com/,Chad,Face-to-face high-level conglomeration,1997,Civil Engineering,365
+32,f5afd686b3d05F5,"Durham, Allen and Barnes",http://chan-stafford.org/,Zimbabwe,Synergistic web-enabled framework,1993,Mechanical or Industrial Engineering,6135
+33,38C6cfC5074Fa5e,Fritz-Franklin,http://www.lambert.com/,Nepal,Automated 4thgeneration website,1972,Hospitality,4516
+34,5Cd7efccCcba38f,Burch-Ewing,http://cline.net/,Taiwan,User-centric 4thgeneration system engine,1981,Venture Capital / VC,7443
+35,9E6Acb51e3F9d6F,"Glass, Barrera and Turner",https://dunlap.com/,Kyrgyz Republic,Multi-channeled 3rdgeneration open system,2020,Utilities,2610
+36,4D4d7E18321eaeC,Pineda-Cox,http://aguilar.org/,Bolivia,Fundamental asynchronous capability,2010,Human Resources / HR,1312
+37,485f5d06B938F2b,"Baker, Mccann and Macdonald",http://www.anderson-barker.com/,Kenya,Cross-group user-facing focus group,2013,Legislative Office,1638
+38,19E3a5Bf6dBDc4F,Cuevas-Moss,https://dodson-castaneda.net/,Guatemala,Extended human-resource intranet,1994,Music,9995
+39,6883A965c7b68F7,Hahn PLC,http://newman.com/,Belarus,Organic logistical leverage,2012,Electrical / Electronic Manufacturing,3715
+40,AC5B7AA74Aa4A2E,"Valentine, Ferguson and Kramer",http://stuart.net/,Jersey,Centralized secondary time-frame,1997,Non - Profit / Volunteering,3585
+41,decab0D5027CA6a,Arroyo Inc,https://www.turner.com/,Grenada,Managed demand-driven website,2006,Writing / Editing,9067
+42,dF084FbBb613eea,Walls LLC,http://www.reese-vasquez.biz/,Cape Verde,Self-enabling fresh-thinking installation,1989,Investment Management / Hedge Fund / Private Equity,1678
+43,A2D89Ab9bCcAd4e,"Mitchell, Warren and Schneider",https://fox.biz/,Trinidad and Tobago,Enhanced intangible time-frame,2021,Capital Markets / Hedge Fund / Private Equity,3816
+44,77aDc905434a49f,Prince PLC,https://www.watts.com/,Sweden,Profit-focused coherent installation,2016,Individual / Family Services,7645
+45,235fdEFE2cfDa5F,Brock-Blackwell,http://www.small.com/,Benin,Secured foreground emulation,1986,Online Publishing,7034
+46,1eD64cFe986BBbE,Walton-Barnett,https://ashley-schaefer.com/,Western Sahara,Right-sized clear-thinking flexibility,2001,Luxury Goods / Jewelry,1746
+47,CbBbFcdd0eaE2cF,Bartlett-Arroyo,https://cruz.com/,Northern Mariana Islands,Realigned didactic function,1976,Civic / Social Organization,3987
+48,49aECbDaE6aBD53,"Wallace, Madden and Morris",http://www.blevins-fernandez.biz/,Germany,Persistent real-time customer loyalty,2016,Pharmaceuticals,9443
+49,7b3fe6e7E72bFa4,Berg-Sparks,https://cisneros-love.com/,Canada,Stand-alone static implementation,1974,Arts / Crafts,2073
+50,c6DedA82A8aef7E,Gonzales Ltd,http://bird.com/,Tonga,Managed human-resource policy,1988,Consumer Goods,9069
+51,7D9FBF85cdC3871,Lawson and Sons,https://www.wong.com/,French Southern Territories,Compatible analyzing intranet,2021,Arts / Crafts,3527
+52,7dd18Fb7cB07b65,"Mcguire, Mcconnell and Olsen",https://melton-briggs.com/,Korea,Profound client-server frame,1988,Printing,8445
+53,EF5B55FadccB8Fe,Charles-Phillips,https://bowman.com/,Cote d'Ivoire,Monitored client-server implementation,2012,Mental Health Care,3450
+54,f8D4B99e11fAF5D,Odom Ltd,https://www.humphrey-hess.com/,Cote d'Ivoire,Advanced static process improvement,2012,Management Consulting,1825
+55,e24D21BFd3bF1E5,Richard PLC,https://holden-coleman.net/,Mayotte,Object-based optimizing model,1971,Broadcast Media,4942
+56,B9BdfEB6D3Ca44E,Sampson Ltd,https://blevins.com/,Cayman Islands,Intuitive local adapter,2005,Farming,1418
+57,2a74D6f3D3B268e,"Cherry, Le and Callahan",https://waller-delacruz.biz/,Nigeria,Universal human-resource collaboration,2017,Entertainment / Movie Production,7202
+58,Bf3F3f62c8aBC33,Cherry PLC,https://www.avila.info/,Marshall Islands,Persistent tertiary website,1980,Plastics,8245
+59,aeBe26B80a7a23c,Melton-Nichols,https://kennedy.com/,Palau,User-friendly clear-thinking productivity,2021,Legislative Office,8741
+60,aAeb29ad43886C6,Potter-Walsh,http://thomas-french.org/,Turkey,Optional non-volatile open system,2008,Human Resources / HR,6923
+61,bD1bc6bB6d1FeD3,Freeman-Chen,https://mathis.com/,Timor-Leste,Phased next generation adapter,1973,International Trade / Development,346
+62,EB9f456e8b7022a,Soto Group,https://norris.info/,Vietnam,Enterprise-wide executive installation,1988,Business Supplies / Equipment,9097
+63,Dfef38C51D8DAe3,"Poole, Cruz and Whitney",https://reed.info/,Reunion,Balanced analyzing groupware,1978,Marketing / Advertising / Sales,2992
+64,055ffEfB2Dd95B0,Riley Ltd,http://wiley.com/,Brazil,Optional exuding superstructure,1986,Textiles,9315
+65,cBfe4dbAE1699da,"Erickson, Andrews and Bailey",https://www.hobbs-grant.com/,Eritrea,Vision-oriented secondary project,2014,Consumer Electronics,7829
+66,fdFbecbadcdCdf1,"Wilkinson, Charles and Arroyo",http://hunter-mcfarland.com/,United States Virgin Islands,Assimilated 24/7 archive,1996,Building Materials,602
+67,5DCb8A5a5ca03c0,Floyd Ltd,http://www.whitney.com/,Falkland Islands (Malvinas),Function-based fault-tolerant concept,2017,Public Relations / PR,2911
+68,ce57DCbcFD6d618,Newman-Galloway,https://www.scott.com/,Luxembourg,Enhanced foreground collaboration,1987,Information Technology / IT,3934
+69,5aaD187dc929371,Frazier-Butler,https://www.daugherty-farley.info/,Northern Mariana Islands,Persistent interactive circuit,1972,Outsourcing / Offshoring,5130
+70,902D7Ac8b6d476b,Newton Inc,https://www.richmond-manning.info/,Netherlands Antilles,Fundamental stable info-mediaries,1976,Military Industry,563
+71,32BB9Ff4d939788,Duffy-Levy,https://www.potter.com/,Guernsey,Diverse exuding installation,1982,Wireless,6146
+72,adcB0afbE58bAe3,Wagner LLC,https://decker-esparza.com/,Uruguay,Reactive attitude-oriented toolset,1987,International Affairs,6874
+73,dfcA1c84AdB61Ac,Mccall-Holmes,http://www.dean.com/,Benin,Object-based value-added database,2009,Legal Services,696
+74,208044AC2fe52F3,Massey LLC,https://frazier.biz/,Suriname,Configurable zero administration Graphical User Interface,1986,Accounting,5004
+75,f3C365f0c1A0623,Hicks LLC,http://alvarez.biz/,Pakistan,Quality-focused client-server Graphical User Interface,1970,Computer Software / Engineering,8480
+76,ec5Bdd3CBAfaB93,"Cole, Russell and Avery",http://www.blankenship.com/,Mongolia,De-engineered fault-tolerant challenge,2000,Law Enforcement,7012
+77,DDB19Be7eeB56B4,Cummings-Rojas,https://simon-pearson.com/,Svalbard & Jan Mayen Islands,User-centric modular customer loyalty,2012,Financial Services,7529
+78,dd6CA3d0bc3cAfc,"Beasley, Greene and Mahoney",http://www.petersen-lawrence.com/,Togo,Extended content-based methodology,1976,Religious Institutions,869
+79,A0B9d56e61070e3,"Beasley, Sims and Allison",http://burke.info/,Latvia,Secured zero tolerance hub,1972,Facilities Services,6182
+80,cBa7EFe5D05Adaf,Crawford-Rivera,https://black-ramirez.org/,Cuba,Persevering exuding budgetary management,1999,Online Publishing,7805
+81,Ea3f6D52Ec73563,Montes-Hensley,https://krueger.org/,Liechtenstein,Multi-tiered secondary productivity,2009,Printing,8433
+82,bC0CEd48A8000E0,Velazquez-Odom,https://stokes.com/,Djibouti,Streamlined 6thgeneration function,2002,Alternative Dispute Resolution,4044
+83,c89b9b59BC4baa1,Eaton-Morales,https://www.reeves-graham.com/,Micronesia,Customer-focused explicit frame,1990,Capital Markets / Hedge Fund / Private Equity,7013
+84,FEC51bce8421a7b,"Roberson, Pennington and Palmer",http://www.keith-fisher.com/,Cameroon,Adaptive bi-directional hierarchy,1993,Telecommunications,5571
+85,e0E8e27eAc9CAd5,"George, Russo and Guerra",https://drake.com/,Sweden,Centralized non-volatile capability,1989,Military Industry,2880
+86,B97a6CF9bf5983C,Davila Inc,https://mcconnell.info/,Cocos (Keeling) Islands,Profit-focused dedicated frame,2017,Consumer Electronics,2215
+87,a0a6f9b3DbcBEb5,Mays-Preston,http://www.browning-key.com/,Mali,User-centric heuristic focus group,2006,Military Industry,5786
+88,8cC1bDa330a5871,Pineda-Morton,https://www.carr.com/,United States Virgin Islands,Grass-roots methodical info-mediaries,1991,Printing,6168
+89,ED889CB2FE9cbd3,Huang and Sons,https://www.bolton.com/,Eritrea,Re-contextualized dynamic hierarchy,1981,Semiconductors,7484
+90,F4Dc1417BC6cb8f,Gilbert-Simon,https://www.bradford.biz/,Burundi,Grass-roots radical parallelism,1973,Newspapers / Journalism,1927
+91,7ABc3c7ecA03B34,Sampson-Griffith,http://hendricks.org/,Benin,Multi-layered composite paradigm,1972,Textiles,3881
+92,4e0719FBE38e0aB,Miles-Dominguez,http://www.turner.com/,Gibraltar,Organized empowering forecast,1996,Civic / Social Organization,897
+93,dEbDAAeDfaed00A,Rowe and Sons,https://www.simpson.org/,El Salvador,Balanced multimedia knowledgebase,1978,Facilities Services,8172
+94,61BDeCfeFD0cEF5,"Valenzuela, Holmes and Rowland",https://www.dorsey.net/,Taiwan,Persistent tertiary focus group,1999,Transportation,1483
+95,4e91eD25f486110,"Best, Wade and Shepard",https://zimmerman.com/,Zimbabwe,Innovative background definition,1991,Gambling / Casinos,4873
+96,0a0bfFbBbB8eC7c,Holmes Group,https://mcdowell.org/,Ethiopia,Right-sized zero tolerance focus group,1975,Photography,2988
+97,BA6Cd9Dae2Efd62,Good Ltd,http://duffy.com/,Anguilla,Reverse-engineered composite moratorium,1971,Consumer Services,4292
+98,E7df80C60Abd7f9,Clements-Espinoza,http://www.flowers.net/,Falkland Islands (Malvinas),Progressive modular hub,1991,Broadcast Media,236
+99,AFc285dbE2fEd24,Mendez Inc,https://www.burke.net/,Kyrgyz Republic,User-friendly exuding migration,1993,Education Management,339
+100,e9eB5A60Cef8354,Watkins-Kaiser,http://www.herring.com/,Togo,Synergistic background access,2009,Financial Services,2785
diff --git a/applications/ColossalQA/data/tests/sample-pdf-file.pdf b/applications/ColossalQA/data/tests/sample-pdf-file.pdf
new file mode 100644
index 000000000000..4b6eea24d6ea
Binary files /dev/null and b/applications/ColossalQA/data/tests/sample-pdf-file.pdf differ
diff --git a/applications/ColossalQA/data/tests/test.html b/applications/ColossalQA/data/tests/test.html
new file mode 100644
index 000000000000..5ad21421d827
--- /dev/null
+++ b/applications/ColossalQA/data/tests/test.html
@@ -0,0 +1,1970 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ logging — Logging facility for Python — Python 3.11.5 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
This module defines functions and classes which implement a flexible event
+logging system for applications and libraries.
+
The key benefit of having the logging API provided by a standard library module
+is that all Python modules can participate in logging, so your application log
+can include your own messages integrated with messages from third-party
+modules.
The module provides a lot of functionality and flexibility. If you are
+unfamiliar with logging, the best way to get to grips with it is to view the
+tutorials (see the links above and on the right).
+
The basic classes defined by the module, together with their functions, are
+listed below.
+
+
Loggers expose the interface that application code directly uses.
+
Handlers send the log records (created by loggers) to the appropriate
+destination.
+
Filters provide a finer grained facility for determining which log records
+to output.
+
Formatters specify the layout of log records in the final output.
Loggers have the following attributes and methods. Note that Loggers should
+NEVER be instantiated directly, but always through the module-level function
+logging.getLogger(name). Multiple calls to getLogger() with the same
+name will always return a reference to the same Logger object.
+
The name is potentially a period-separated hierarchical value, like
+foo.bar.baz (though it could also be just plain foo, for example).
+Loggers that are further down in the hierarchical list are children of loggers
+higher up in the list. For example, given a logger with a name of foo,
+loggers with names of foo.bar, foo.bar.baz, and foo.bam are all
+descendants of foo. The logger name hierarchy is analogous to the Python
+package hierarchy, and identical to it if you organise your loggers on a
+per-module basis using the recommended construction
+logging.getLogger(__name__). That’s because in a module, __name__
+is the module’s name in the Python package namespace.
If this attribute evaluates to true, events logged to this logger will be
+passed to the handlers of higher level (ancestor) loggers, in addition to
+any handlers attached to this logger. Messages are passed directly to the
+ancestor loggers’ handlers - neither the level nor filters of the ancestor
+loggers in question are considered.
+
If this evaluates to false, logging messages are not passed to the handlers
+of ancestor loggers.
+
Spelling it out with an example: If the propagate attribute of the logger named
+A.B.C evaluates to true, any event logged to A.B.C via a method call such as
+logging.getLogger('A.B.C').error(...) will [subject to passing that logger’s
+level and filter settings] be passed in turn to any handlers attached to loggers
+named A.B, A and the root logger, after first being passed to any handlers
+attached to A.B.C. If any logger in the chain A.B.C, A.B, A has its
+propagate attribute set to false, then that is the last logger whose handlers
+are offered the event to handle, and propagation stops at that point.
+
The constructor sets this attribute to True.
+
+
Note
+
If you attach a handler to a logger and one or more of its
+ancestors, it may emit the same record multiple times. In general, you
+should not need to attach a handler to more than one logger - if you just
+attach it to the appropriate logger which is highest in the logger
+hierarchy, then it will see all events logged by all descendant loggers,
+provided that their propagate setting is left set to True. A common
+scenario is to attach handlers only to the root logger, and to let
+propagation take care of the rest.
Sets the threshold for this logger to level. Logging messages which are less
+severe than level will be ignored; logging messages which have severity level
+or higher will be emitted by whichever handler or handlers service this logger,
+unless a handler’s level has been set to a higher severity level than level.
+
When a logger is created, the level is set to NOTSET (which causes
+all messages to be processed when the logger is the root logger, or delegation
+to the parent when the logger is a non-root logger). Note that the root logger
+is created with level WARNING.
+
The term ‘delegation to the parent’ means that if a logger has a level of
+NOTSET, its chain of ancestor loggers is traversed until either an ancestor with
+a level other than NOTSET is found, or the root is reached.
+
If an ancestor is found with a level other than NOTSET, then that ancestor’s
+level is treated as the effective level of the logger where the ancestor search
+began, and is used to determine how a logging event is handled.
+
If the root is reached, and it has a level of NOTSET, then all messages will be
+processed. Otherwise, the root’s level will be used as the effective level.
Changed in version 3.2: The level parameter now accepts a string representation of the
+level such as ‘INFO’ as an alternative to the integer constants
+such as INFO. Note, however, that levels are internally stored
+as integers, and methods such as e.g. getEffectiveLevel() and
+isEnabledFor() will return/expect to be passed integers.
Indicates if a message of severity level would be processed by this logger.
+This method checks first the module-level level set by
+logging.disable(level) and then the logger’s effective level as determined
+by getEffectiveLevel().
Indicates the effective level for this logger. If a value other than
+NOTSET has been set using setLevel(), it is returned. Otherwise,
+the hierarchy is traversed towards the root until a value other than
+NOTSET is found, and that value is returned. The value returned is
+an integer, typically one of logging.DEBUG, logging.INFO
+etc.
Returns a logger which is a descendant to this logger, as determined by the suffix.
+Thus, logging.getLogger('abc').getChild('def.ghi') would return the same
+logger as would be returned by logging.getLogger('abc.def.ghi'). This is a
+convenience method, useful when the parent logger is named using e.g. __name__
+rather than a literal string.
Logs a message with level DEBUG on this logger. The msg is the
+message format string, and the args are the arguments which are merged into
+msg using the string formatting operator. (Note that this means that you can
+use keywords in the format string, together with a single dictionary argument.)
+No % formatting operation is performed on msg when no args are supplied.
+
There are four keyword arguments in kwargs which are inspected:
+exc_info, stack_info, stacklevel and extra.
+
If exc_info does not evaluate as false, it causes exception information to be
+added to the logging message. If an exception tuple (in the format returned by
+sys.exc_info()) or an exception instance is provided, it is used;
+otherwise, sys.exc_info() is called to get the exception information.
+
The second optional keyword argument is stack_info, which defaults to
+False. If true, stack information is added to the logging
+message, including the actual logging call. Note that this is not the same
+stack information as that displayed through specifying exc_info: The
+former is stack frames from the bottom of the stack up to the logging call
+in the current thread, whereas the latter is information about stack frames
+which have been unwound, following an exception, while searching for
+exception handlers.
+
You can specify stack_info independently of exc_info, e.g. to just show
+how you got to a certain point in your code, even when no exceptions were
+raised. The stack frames are printed following a header line which says:
+
Stack (most recent call last):
+
+
+
This mimics the Traceback(mostrecentcalllast): which is used when
+displaying exception frames.
+
The third optional keyword argument is stacklevel, which defaults to 1.
+If greater than 1, the corresponding number of stack frames are skipped
+when computing the line number and function name set in the LogRecord
+created for the logging event. This can be used in logging helpers so that
+the function name, filename and line number recorded are not the information
+for the helper function/method, but rather its caller. The name of this
+parameter mirrors the equivalent one in the warnings module.
+
The fourth keyword argument is extra which can be used to pass a
+dictionary which is used to populate the __dict__ of the LogRecord
+created for the logging event with user-defined attributes. These custom
+attributes can then be used as you like. For example, they could be
+incorporated into logged messages. For example:
The keys in the dictionary passed in extra should not clash with the keys used
+by the logging system. (See the section on LogRecord attributes for more
+information on which keys are used by the logging system.)
+
If you choose to use these attributes in logged messages, you need to exercise
+some care. In the above example, for instance, the Formatter has been
+set up with a format string which expects ‘clientip’ and ‘user’ in the attribute
+dictionary of the LogRecord. If these are missing, the message will
+not be logged because a string formatting exception will occur. So in this case,
+you always need to pass the extra dictionary with these keys.
+
While this might be annoying, this feature is intended for use in specialized
+circumstances, such as multi-threaded servers where the same code executes in
+many contexts, and interesting conditions which arise are dependent on this
+context (such as remote client IP address and authenticated user name, in the
+above example). In such circumstances, it is likely that specialized
+Formatters would be used with particular Handlers.
+
If no handler is attached to this logger (or any of its ancestors,
+taking into account the relevant Logger.propagate attributes),
+the message will be sent to the handler set on lastResort.
+
+
Changed in version 3.2: The stack_info parameter was added.
+
+
+
Changed in version 3.5: The exc_info parameter can now accept exception instances.
+
+
+
Changed in version 3.8: The stacklevel parameter was added.
Logs a message with level ERROR on this logger. The arguments are
+interpreted as for debug(). Exception info is added to the logging
+message. This method should only be called from an exception handler.
Apply this logger’s filters to the record and return True if the
+record is to be processed. The filters are consulted in turn, until one of
+them returns a false value. If none of them return a false value, the record
+will be processed (passed to handlers). If one returns a false value, no
+further processing of the record occurs.
Finds the caller’s source filename and line number. Returns the filename, line
+number, function name and stack information as a 4-element tuple. The stack
+information is returned as None unless stack_info is True.
+
The stacklevel parameter is passed from code calling the debug()
+and other APIs. If greater than 1, the excess is used to skip stack frames
+before determining the values to be returned. This will generally be useful
+when calling logging APIs from helper/wrapper code, so that the information
+in the event log refers not to the helper/wrapper code, but to the code that
+calls it.
Handles a record by passing it to all handlers associated with this logger and
+its ancestors (until a false value of propagate is found). This method is used
+for unpickled records received from a socket, as well as those created locally.
+Logger-level filtering is applied using filter().
Checks to see if this logger has any handlers configured. This is done by
+looking for handlers in this logger and its parents in the logger hierarchy.
+Returns True if a handler was found, else False. The method stops searching
+up the hierarchy whenever a logger with the ‘propagate’ attribute set to
+false is found - that will be the last logger which is checked for the
+existence of handlers.
+
+
New in version 3.2.
+
+
+
+
+
Changed in version 3.7: Loggers can now be pickled and unpickled.
The numeric values of logging levels are given in the following table. These are
+primarily of interest if you want to define your own levels, and need them to
+have specific values relative to the predefined levels. If you define a level
+with the same numeric value, it overwrites the predefined value; the predefined
+name is lost.
When set on a logger, indicates that
+ancestor loggers are to be consulted
+to determine the effective level.
+If that still resolves to
+NOTSET, then all events
+are logged. When set on a handler,
+all events are handled.
An indication that something
+unexpected happened, or that a
+problem might occur in the near
+future (e.g. ‘disk space low’). The
+software is still working as
+expected.
Handlers have the following attributes and methods. Note that Handler
+is never instantiated directly; this class acts as a base for more useful
+subclasses. However, the __init__() method in subclasses needs to call
+Handler.__init__().
Initializes the Handler instance by setting its level, setting the list
+of filters to the empty list and creating a lock (using createLock()) for
+serializing access to an I/O mechanism.
Sets the threshold for this handler to level. Logging messages which are
+less severe than level will be ignored. When a handler is created, the
+level is set to NOTSET (which causes all messages to be
+processed).
Changed in version 3.2: The level parameter now accepts a string representation of the
+level such as ‘INFO’ as an alternative to the integer constants
+such as INFO.
Apply this handler’s filters to the record and return True if the
+record is to be processed. The filters are consulted in turn, until one of
+them returns a false value. If none of them return a false value, the record
+will be emitted. If one returns a false value, the handler will not emit the
+record.
Tidy up any resources used by the handler. This version does no output but
+removes the handler from an internal list of handlers which is closed when
+shutdown() is called. Subclasses should ensure that this gets called
+from overridden close() methods.
Conditionally emits the specified logging record, depending on filters which may
+have been added to the handler. Wraps the actual emission of the record with
+acquisition/release of the I/O thread lock.
This method should be called from handlers when an exception is encountered
+during an emit() call. If the module-level attribute
+raiseExceptions is False, exceptions get silently ignored. This is
+what is mostly wanted for a logging system - most users will not care about
+errors in the logging system, they are more interested in application
+errors. You could, however, replace this with a custom handler if you wish.
+The specified record is the one which was being processed when the exception
+occurred. (The default value of raiseExceptions is True, as that is
+more useful during development).
Do whatever it takes to actually log the specified logging record. This version
+is intended to be implemented by subclasses and so raises a
+NotImplementedError.
+
+
Warning
+
This method is called after a handler-level lock is acquired, which
+is released after this method returns. When you override this method, note
+that you should be careful when calling anything that invokes other parts of
+the logging API which might do locking, because that might result in a
+deadlock. Specifically:
+
+
Logging configuration APIs acquire the module-level lock, and then
+individual handler-level locks as those handlers are configured.
+
Many logging APIs lock the module-level lock. If such an API is called
+from this method, it could cause a deadlock if a configuration call is
+made on another thread, because that thread will try to acquire the
+module-level lock before the handler-level lock, whereas this thread
+tries to acquire the module-level lock after the handler-level lock
+(because in this method, the handler-level lock has already been acquired).
+
+
+
+
+
+
+
For a list of handlers included as standard, see logging.handlers.
Formatter objects have the following attributes and methods. They are
+responsible for converting a LogRecord to (usually) a string which can
+be interpreted by either a human or an external system. The base
+Formatter allows a formatting string to be specified. If none is
+supplied, the default value of '%(message)s' is used, which just includes
+the message in the logging call. To have additional items of information in the
+formatted output (such as a timestamp), keep reading.
+
A Formatter can be initialized with a format string which makes use of knowledge
+of the LogRecord attributes - such as the default value mentioned above
+making use of the fact that the user’s message and arguments are pre-formatted
+into a LogRecord’s message attribute. This format string contains
+standard Python %-style mapping keys. See section printf-style String Formatting
+for more information on string formatting.
Returns a new instance of the Formatter class. The instance is
+initialized with a format string for the message as a whole, as well as a
+format string for the date/time portion of a message. If no fmt is
+specified, '%(message)s' is used. If no datefmt is specified, a format
+is used which is described in the formatTime() documentation.
+
The style parameter can be one of ‘%’, ‘{’ or ‘$’ and determines how
+the format string will be merged with its data: using one of %-formatting,
+str.format() or string.Template. This only applies to the
+format string fmt (e.g. '%(message)s' or {message}), not to the
+actual log messages passed to Logger.debug etc; see
+Using particular formatting styles throughout your application for more information on using {- and $-formatting
+for log messages.
+
The defaults parameter can be a dictionary with default values to use in
+custom fields. For example:
+logging.Formatter('%(ip)s%(message)s',defaults={"ip":None})
+
+
Changed in version 3.2: The style parameter was added.
+
+
+
Changed in version 3.8: The validate parameter was added. Incorrect or mismatched style and fmt
+will raise a ValueError.
+For example: logging.Formatter('%(asctime)s-%(message)s',style='{').
+
+
+
Changed in version 3.10: The defaults parameter was added.
The record’s attribute dictionary is used as the operand to a string
+formatting operation. Returns the resulting string. Before formatting the
+dictionary, a couple of preparatory steps are carried out. The message
+attribute of the record is computed using msg % args. If the
+formatting string contains '(asctime)', formatTime() is called
+to format the event time. If there is exception information, it is
+formatted using formatException() and appended to the message. Note
+that the formatted exception information is cached in attribute
+exc_text. This is useful because the exception information can be
+pickled and sent across the wire, but you should be careful if you have
+more than one Formatter subclass which customizes the formatting
+of exception information. In this case, you will have to clear the cached
+value (by setting the exc_text attribute to None) after a formatter
+has done its formatting, so that the next formatter to handle the event
+doesn’t use the cached value, but recalculates it afresh.
+
If stack information is available, it’s appended after the exception
+information, using formatStack() to transform it if necessary.
This method should be called from format() by a formatter which
+wants to make use of a formatted time. This method can be overridden in
+formatters to provide for any specific requirement, but the basic behavior
+is as follows: if datefmt (a string) is specified, it is used with
+time.strftime() to format the creation time of the
+record. Otherwise, the format ‘%Y-%m-%d %H:%M:%S,uuu’ is used, where the
+uuu part is a millisecond value and the other letters are as per the
+time.strftime() documentation. An example time in this format is
+2003-01-2300:29:50,411. The resulting string is returned.
+
This function uses a user-configurable function to convert the creation
+time to a tuple. By default, time.localtime() is used; to change
+this for a particular formatter instance, set the converter attribute
+to a function with the same signature as time.localtime() or
+time.gmtime(). To change it for all formatters, for example if you
+want all logging times to be shown in GMT, set the converter
+attribute in the Formatter class.
+
+
Changed in version 3.3: Previously, the default format was hard-coded as in this example:
+2010-09-0622:38:15,292 where the part before the comma is
+handled by a strptime format string ('%Y-%m-%d%H:%M:%S'), and the
+part after the comma is a millisecond value. Because strptime does not
+have a format placeholder for milliseconds, the millisecond value is
+appended using another format string, '%s,%03d' — and both of these
+format strings have been hardcoded into this method. With the change,
+these strings are defined as class-level attributes which can be
+overridden at the instance level when desired. The names of the
+attributes are default_time_format (for the strptime format string)
+and default_msec_format (for appending the millisecond value).
+
+
+
Changed in version 3.9: The default_msec_format can be None.
Formats the specified exception information (a standard exception tuple as
+returned by sys.exc_info()) as a string. This default implementation
+just uses traceback.print_exception(). The resulting string is
+returned.
Formats the specified stack information (a string as returned by
+traceback.print_stack(), but with the last newline removed) as a
+string. This default implementation just returns the input value.
A base formatter class suitable for subclassing when you want to format a
+number of records. You can pass a Formatter instance which you want
+to use to format each line (that corresponds to a single record). If not
+specified, the default formatter (which just outputs the event message) is
+used as the line formatter.
Return a header for a list of records. The base implementation just
+returns the empty string. You will need to override this method if you
+want specific behaviour, e.g. to show the count of records, a title or a
+separator line.
Return a footer for a list of records. The base implementation just
+returns the empty string. You will need to override this method if you
+want specific behaviour, e.g. to show the count of records or a separator
+line.
Return formatted text for a list of records. The base implementation
+just returns the empty string if there are no records; otherwise, it
+returns the concatenation of the header, each record formatted with the
+line formatter, and the footer.
Filters can be used by Handlers and Loggers for more sophisticated
+filtering than is provided by levels. The base filter class only allows events
+which are below a certain point in the logger hierarchy. For example, a filter
+initialized with ‘A.B’ will allow events logged by loggers ‘A.B’, ‘A.B.C’,
+‘A.B.C.D’, ‘A.B.D’ etc. but not ‘A.BB’, ‘B.A.B’ etc. If initialized with the
+empty string, all events are passed.
Returns an instance of the Filter class. If name is specified, it
+names a logger which, together with its children, will have its events allowed
+through the filter. If name is the empty string, allows every event.
Is the specified record to be logged? Returns zero for no, nonzero for
+yes. If deemed appropriate, the record may be modified in-place by this
+method.
+
+
+
+
+
Note that filters attached to handlers are consulted before an event is
+emitted by the handler, whereas filters attached to loggers are consulted
+whenever an event is logged (using debug(), info(),
+etc.), before sending an event to handlers. This means that events which have
+been generated by descendant loggers will not be filtered by a logger’s filter
+setting, unless the filter has also been applied to those descendant loggers.
+
You don’t actually need to subclass Filter: you can pass any instance
+which has a filter method with the same semantics.
+
+
Changed in version 3.2: You don’t need to create specialized Filter classes, or use other
+classes with a filter method: you can use a function (or other
+callable) as a filter. The filtering logic will check to see if the filter
+object has a filter attribute: if it does, it’s assumed to be a
+Filter and its filter() method is called. Otherwise, it’s
+assumed to be a callable and called with the record as the single
+parameter. The returned value should conform to that returned by
+filter().
+
+
Although filters are used primarily to filter records based on more
+sophisticated criteria than levels, they get to see every record which is
+processed by the handler or logger they’re attached to: this can be useful if
+you want to do things like counting how many records were processed by a
+particular logger or handler, or adding, changing or removing attributes in
+the LogRecord being processed. Obviously changing the LogRecord needs
+to be done with some care, but it does allow the injection of contextual
+information into logs (see Using Filters to impart contextual information).
LogRecord instances are created automatically by the Logger
+every time something is logged, and can be created manually via
+makeLogRecord() (for example, from a pickled event received over the
+wire).
Contains all the information pertinent to the event being logged.
+
The primary information is passed in msg and args,
+which are combined using msg%args to create
+the message attribute of the record.
+
+
Parameters
+
+
name (str) – The name of the logger used to log the event
+represented by this LogRecord.
+Note that the logger name in the LogRecord
+will always have this value,
+even though it may be emitted by a handler
+attached to a different (ancestor) logger.
+
level (int) – The numeric level of the logging event
+(such as 10 for DEBUG, 20 for INFO, etc).
+Note that this is converted to two attributes of the LogRecord:
+levelno for the numeric value
+and levelname for the corresponding level name.
+
pathname (str) – The full string path of the source file
+where the logging call was made.
+
lineno (int) – The line number in the source file
+where the logging call was made.
+
msg (Any) – The event description message,
+which can be a %-format string with placeholders for variable data,
+or an arbitrary object (see Using arbitrary objects as messages).
+
args (tuple | dict[str, Any]) – Variable data to merge into the msg argument
+to obtain the event description.
Returns the message for this LogRecord instance after merging any
+user-supplied arguments with the message. If the user-supplied message
+argument to the logging call is not a string, str() is called on it to
+convert it to a string. This allows use of user-defined classes as
+messages, whose __str__ method can return the actual format string to
+be used.
+
+
+
+
Changed in version 3.2: The creation of a LogRecord has been made more configurable by
+providing a factory which is used to create the record. The factory can be
+set using getLogRecordFactory() and setLogRecordFactory()
+(see this for the factory’s signature).
+
+
This functionality can be used to inject your own values into a
+LogRecord at creation time. You can use the following pattern:
With this pattern, multiple factories could be chained, and as long
+as they don’t overwrite each other’s attributes or unintentionally
+overwrite the standard attributes listed above, there should be no
+surprises.
The LogRecord has a number of attributes, most of which are derived from the
+parameters to the constructor. (Note that the names do not always correspond
+exactly between the LogRecord constructor parameters and the LogRecord
+attributes.) These attributes can be used to merge data from the record into
+the format string. The following table lists (in alphabetical order) the
+attribute names, their meanings and the corresponding placeholder in a %-style
+format string.
+
If you are using {}-formatting (str.format()), you can use
+{attrname} as the placeholder in the format string. If you are using
+$-formatting (string.Template), use the form ${attrname}. In
+both cases, of course, replace attrname with the actual attribute name
+you want to use.
+
In the case of {}-formatting, you can specify formatting flags by placing them
+after the attribute name, separated from it with a colon. For example: a
+placeholder of {msecs:03d} would format a millisecond value of 4 as
+004. Refer to the str.format() documentation for full details on
+the options available to you.
+
+
+
+
+
+
+
+
Attribute name
+
Format
+
Description
+
+
+
+
args
+
You shouldn’t need to
+format this yourself.
+
The tuple of arguments merged into msg to
+produce message, or a dict whose values
+are used for the merge (when there is only one
+argument, and it is a dictionary).
+
+
asctime
+
%(asctime)s
+
Human-readable time when the
+LogRecord was created. By default
+this is of the form ‘2003-07-08 16:49:45,896’
+(the numbers after the comma are millisecond
+portion of the time).
Source line number where the logging call was
+issued (if available).
+
+
message
+
%(message)s
+
The logged message, computed as msg%
+args. This is set when
+Formatter.format() is invoked.
+
+
module
+
%(module)s
+
Module (name portion of filename).
+
+
msecs
+
%(msecs)d
+
Millisecond portion of the time when the
+LogRecord was created.
+
+
msg
+
You shouldn’t need to
+format this yourself.
+
The format string passed in the original
+logging call. Merged with args to
+produce message, or an arbitrary object
+(see Using arbitrary objects as messages).
+
+
name
+
%(name)s
+
Name of the logger used to log the call.
+
+
pathname
+
%(pathname)s
+
Full pathname of the source file where the
+logging call was issued (if available).
+
+
process
+
%(process)d
+
Process ID (if available).
+
+
processName
+
%(processName)s
+
Process name (if available).
+
+
relativeCreated
+
%(relativeCreated)d
+
Time in milliseconds when the LogRecord was
+created, relative to the time the logging
+module was loaded.
+
+
stack_info
+
You shouldn’t need to
+format this yourself.
+
Stack frame information (where available)
+from the bottom of the stack in the current
+thread, up to and including the stack frame
+of the logging call which resulted in the
+creation of this record.
Modifies the message and/or keyword arguments passed to a logging call in
+order to insert contextual information. This implementation takes the object
+passed as extra to the constructor and adds it to kwargs using key
+‘extra’. The return value is a (msg, kwargs) tuple which has the
+(possibly modified) versions of the arguments passed in.
The logging module is intended to be thread-safe without any special work
+needing to be done by its clients. It achieves this though using threading
+locks; there is one lock to serialize access to the module’s shared data, and
+each handler also creates a lock to serialize access to its underlying I/O.
+
If you are implementing asynchronous signal handlers using the signal
+module, you may not be able to use logging from within such handlers. This is
+because lock implementations in the threading module are not always
+re-entrant, and so cannot be invoked from such signal handlers.
Return a logger with the specified name or, if name is None, return a
+logger which is the root logger of the hierarchy. If specified, the name is
+typically a dot-separated hierarchical name like ‘a’, ‘a.b’ or ‘a.b.c.d’.
+Choice of these names is entirely up to the developer who is using logging.
+
All calls to this function with a given name return the same logger instance.
+This means that logger instances never need to be passed between different parts
+of an application.
Return either the standard Logger class, or the last class passed to
+setLoggerClass(). This function may be called from within a new class
+definition, to ensure that installing a customized Logger class will
+not undo customizations already applied by other code. For example:
+
classMyLogger(logging.getLoggerClass()):
+ # ... override behaviour here
+
Return a callable which is used to create a LogRecord.
+
+
New in version 3.2: This function has been provided, along with setLogRecordFactory(),
+to allow developers more control over how the LogRecord
+representing a logging event is constructed.
+
+
See setLogRecordFactory() for more information about the how the
+factory is called.
Logs a message with level DEBUG on the root logger. The msg is the
+message format string, and the args are the arguments which are merged into
+msg using the string formatting operator. (Note that this means that you can
+use keywords in the format string, together with a single dictionary argument.)
+
There are three keyword arguments in kwargs which are inspected: exc_info
+which, if it does not evaluate as false, causes exception information to be
+added to the logging message. If an exception tuple (in the format returned by
+sys.exc_info()) or an exception instance is provided, it is used;
+otherwise, sys.exc_info() is called to get the exception information.
+
The second optional keyword argument is stack_info, which defaults to
+False. If true, stack information is added to the logging
+message, including the actual logging call. Note that this is not the same
+stack information as that displayed through specifying exc_info: The
+former is stack frames from the bottom of the stack up to the logging call
+in the current thread, whereas the latter is information about stack frames
+which have been unwound, following an exception, while searching for
+exception handlers.
+
You can specify stack_info independently of exc_info, e.g. to just show
+how you got to a certain point in your code, even when no exceptions were
+raised. The stack frames are printed following a header line which says:
+
Stack (most recent call last):
+
+
+
This mimics the Traceback(mostrecentcalllast): which is used when
+displaying exception frames.
+
The third optional keyword argument is extra which can be used to pass a
+dictionary which is used to populate the __dict__ of the LogRecord created for
+the logging event with user-defined attributes. These custom attributes can then
+be used as you like. For example, they could be incorporated into logged
+messages. For example:
The keys in the dictionary passed in extra should not clash with the keys used
+by the logging system. (See the Formatter documentation for more
+information on which keys are used by the logging system.)
+
If you choose to use these attributes in logged messages, you need to exercise
+some care. In the above example, for instance, the Formatter has been
+set up with a format string which expects ‘clientip’ and ‘user’ in the attribute
+dictionary of the LogRecord. If these are missing, the message will not be
+logged because a string formatting exception will occur. So in this case, you
+always need to pass the extra dictionary with these keys.
+
While this might be annoying, this feature is intended for use in specialized
+circumstances, such as multi-threaded servers where the same code executes in
+many contexts, and interesting conditions which arise are dependent on this
+context (such as remote client IP address and authenticated user name, in the
+above example). In such circumstances, it is likely that specialized
+Formatters would be used with particular Handlers.
Logs a message with level ERROR on the root logger. The arguments are
+interpreted as for debug(). Exception info is added to the logging
+message. This function should only be called from an exception handler.
Provides an overriding level level for all loggers which takes precedence over
+the logger’s own level. When the need arises to temporarily throttle logging
+output down across the whole application, this function can be useful. Its
+effect is to disable all logging calls of severity level and below, so that
+if you call it with a value of INFO, then all INFO and DEBUG events would be
+discarded, whereas those of severity WARNING and above would be processed
+according to the logger’s effective level. If
+logging.disable(logging.NOTSET) is called, it effectively removes this
+overriding level, so that logging output again depends on the effective
+levels of individual loggers.
+
Note that if you have defined any custom logging level higher than
+CRITICAL (this is not recommended), you won’t be able to rely on the
+default value for the level parameter, but will have to explicitly supply a
+suitable value.
+
+
Changed in version 3.7: The level parameter was defaulted to level CRITICAL. See
+bpo-28524 for more information about this change.
Associates level level with text levelName in an internal dictionary, which is
+used to map numeric levels to a textual representation, for example when a
+Formatter formats a message. This function can also be used to define
+your own levels. The only constraints are that all levels used must be
+registered using this function, levels should be positive integers and they
+should increase in increasing order of severity.
+
+
Note
+
If you are thinking of defining your own levels, please see the
+section on Custom Levels.
Returns a mapping from level names to their corresponding logging levels. For example, the
+string “CRITICAL” maps to CRITICAL. The returned mapping is copied from an internal
+mapping on each call to this function.
Returns the textual or numeric representation of logging level level.
+
If level is one of the predefined levels CRITICAL, ERROR,
+WARNING, INFO or DEBUG then you get the
+corresponding string. If you have associated levels with names using
+addLevelName() then the name you have associated with level is
+returned. If a numeric value corresponding to one of the defined levels is
+passed in, the corresponding string representation is returned.
+
The level parameter also accepts a string representation of the level such
+as ‘INFO’. In such cases, this functions returns the corresponding numeric
+value of the level.
+
If no matching numeric or string value is passed in, the string
+‘Level %s’ % level is returned.
+
+
Note
+
Levels are internally integers (as they need to be compared in the
+logging logic). This function is used to convert between an integer level
+and the level name displayed in the formatted log output by means of the
+%(levelname)s format specifier (see LogRecord attributes), and
+vice versa.
+
+
+
Changed in version 3.4: In Python versions earlier than 3.4, this function could also be passed a
+text level, and would return the corresponding numeric value of the level.
+This undocumented behaviour was considered a mistake, and was removed in
+Python 3.4, but reinstated in 3.4.2 due to retain backward compatibility.
Creates and returns a new LogRecord instance whose attributes are
+defined by attrdict. This function is useful for taking a pickled
+LogRecord attribute dictionary, sent over a socket, and reconstituting
+it as a LogRecord instance at the receiving end.
This function does nothing if the root logger already has handlers
+configured, unless the keyword argument force is set to True.
+
+
Note
+
This function should be called from the main thread
+before other threads are started. In versions of Python prior to
+2.7.1 and 3.2, if this function is called from multiple threads,
+it is possible (in rare circumstances) that a handler will be added
+to the root logger more than once, leading to unexpected results
+such as messages being duplicated in the log.
+
+
The following keyword arguments are supported.
+
+
+
+
+
+
+
Format
+
Description
+
+
+
+
filename
+
Specifies that a FileHandler be
+created, using the specified filename,
+rather than a StreamHandler.
+
+
filemode
+
If filename is specified, open the file
+in this mode. Defaults
+to 'a'.
+
+
format
+
Use the specified format string for the
+handler. Defaults to attributes
+levelname, name and message
+separated by colons.
+
+
datefmt
+
Use the specified date/time format, as
+accepted by time.strftime().
+
+
style
+
If format is specified, use this style
+for the format string. One of '%',
+'{' or '$' for printf-style,
+str.format() or
+string.Template respectively.
+Defaults to '%'.
+
+
level
+
Set the root logger level to the specified
+level.
+
+
stream
+
Use the specified stream to initialize the
+StreamHandler. Note that this
+argument is incompatible with filename -
+if both are present, a ValueError is
+raised.
+
+
handlers
+
If specified, this should be an iterable of
+already created handlers to add to the root
+logger. Any handlers which don’t already
+have a formatter set will be assigned the
+default formatter created in this function.
+Note that this argument is incompatible
+with filename or stream - if both
+are present, a ValueError is raised.
+
+
force
+
If this keyword argument is specified as
+true, any existing handlers attached to the
+root logger are removed and closed, before
+carrying out the configuration as specified
+by the other arguments.
+
+
encoding
+
If this keyword argument is specified along
+with filename, its value is used when the
+FileHandler is created, and thus
+used when opening the output file.
+
+
errors
+
If this keyword argument is specified along
+with filename, its value is used when the
+FileHandler is created, and thus
+used when opening the output file. If not
+specified, the value ‘backslashreplace’ is
+used. Note that if None is specified,
+it will be passed as such to open(),
+which means that it will be treated the
+same as passing ‘errors’.
+
+
+
+
+
Changed in version 3.2: The style argument was added.
+
+
+
Changed in version 3.3: The handlers argument was added. Additional checks were added to
+catch situations where incompatible arguments are specified (e.g.
+handlers together with stream or filename, or stream
+together with filename).
+
+
+
Changed in version 3.8: The force argument was added.
+
+
+
Changed in version 3.9: The encoding and errors arguments were added.
Informs the logging system to perform an orderly shutdown by flushing and
+closing all handlers. This should be called at application exit and no
+further use of the logging system should be made after this call.
+
When the logging module is imported, it registers this function as an exit
+handler (see atexit), so normally there’s no need to do that
+manually.
Tells the logging system to use the class klass when instantiating a logger.
+The class should define __init__() such that only a name argument is
+required, and the __init__() should call Logger.__init__(). This
+function is typically called before any loggers are instantiated by applications
+which need to use custom logger behavior. After this call, as at any other
+time, do not instantiate loggers directly using the subclass: continue to use
+the logging.getLogger() API to get your loggers.
Set a callable which is used to create a LogRecord.
+
+
Parameters
+
factory – The factory callable to be used to instantiate a log record.
+
+
+
+
New in version 3.2: This function has been provided, along with getLogRecordFactory(), to
+allow developers more control over how the LogRecord representing
+a logging event is constructed.
A “handler of last resort” is available through this attribute. This
+is a StreamHandler writing to sys.stderr with a level of
+WARNING, and is used to handle logging events in the absence of any
+logging configuration. The end result is to just print the message to
+sys.stderr. This replaces the earlier error message saying that
+“no handlers could be found for logger XYZ”. If you need the earlier
+behaviour for some reason, lastResort can be set to None.
This function is used to turn the capture of warnings by logging on and
+off.
+
If capture is True, warnings issued by the warnings module will
+be redirected to the logging system. Specifically, a warning will be
+formatted using warnings.formatwarning() and the resulting string
+logged to a logger named 'py.warnings' with a severity of WARNING.
+
If capture is False, the redirection of warnings to the logging system
+will stop, and warnings will be redirected to their original destinations
+(i.e. those in effect before captureWarnings(True) was called).
This is the original source for the logging package. The version of the
+package available from this site is suitable for use with Python 1.5.2, 2.1.x
+and 2.2.x, which do not include the logging package in the standard
+library.