Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
fbff5d3
add kv cache memory manager
yuanheng-zhao Aug 23, 2023
2d55ace
add stateinfo during inference
yuanheng-zhao Aug 23, 2023
e55e565
add
CjhHa1 Aug 22, 2023
a971535
add infer example
CjhHa1 Aug 23, 2023
0ae5bb7
finish
CjhHa1 Aug 23, 2023
a8f7386
finish
CjhHa1 Aug 23, 2023
cb45cf8
format
yuanheng-zhao Aug 23, 2023
4f21bc5
format
yuanheng-zhao Aug 23, 2023
389d0d4
rename file
yuanheng-zhao Aug 23, 2023
bdba1b5
add kv cache test
yuanheng-zhao Aug 24, 2023
813e23a
revise on BatchInferState
yuanheng-zhao Aug 24, 2023
5b08d60
Merge commit 'refs/pull/4495/head' of https://github.com/hpcaitech/Co…
isky-cd Aug 24, 2023
469a3c5
add inference test for llama
isky-cd Aug 24, 2023
5993a0f
fix conflict
isky-cd Aug 24, 2023
a98000f
fix conflict
isky-cd Aug 24, 2023
7686c07
fix conflict
isky-cd Aug 24, 2023
ba089d7
feature: add some new features for llama engine
isky-cd Aug 24, 2023
68b5fe8
adapt colossalai triton interface
isky-cd Aug 24, 2023
6021b13
Change the parent class of llama policy
isky-cd Aug 24, 2023
6a1bafa
add nvtx
isky-cd Aug 25, 2023
f79308e
move llama inference code to tensor_parallel
isky-cd Aug 27, 2023
a6cc3dd
Merge branch 'feature/colossal-inference' of https://github.com/hpcai…
isky-cd Aug 28, 2023
2a6a380
fix __init__.py
isky-cd Aug 28, 2023
d10dcf4
rm tensor_parallel
isky-cd Aug 28, 2023
fb2603b
fix: fix bugs in auto_policy.py
isky-cd Aug 28, 2023
92fd955
fix:rm some unused codes
isky-cd Aug 28, 2023
c747249
mv colossalai/tpinference to colossalai/inference/tensor_parallel
isky-cd Aug 28, 2023
8507fc5
Merge branch 'feature/colossal-inference' into llama_test_branch
isky-cd Aug 30, 2023
c27088f
change __init__.py
isky-cd Aug 30, 2023
af16040
save change
isky-cd Aug 30, 2023
bfc55cc
fix conflict
isky-cd Aug 30, 2023
f30f542
fix engine
isky-cd Aug 30, 2023
4b52ebd
Bug fix: Fix hang
isky-cd Aug 30, 2023
6d06421
remove llama_infer_engine.py
isky-cd Aug 30, 2023
1693198
bug fix: fix bugs about infer_state.is_context_stage
isky-cd Aug 30, 2023
8578e8c
fix conflict
isky-cd Aug 30, 2023
62bfb70
remove policies
isky-cd Aug 30, 2023
44e8606
fix: delete unused code
isky-cd Aug 30, 2023
ce26507
fix: delete unused code
isky-cd Aug 30, 2023
2122c67
fix conflict
isky-cd Aug 30, 2023
b3c43bf
remove unused code
isky-cd Aug 30, 2023
c4dbd41
fix conflict
isky-cd Aug 30, 2023
ce5d0a0
fix conflict
isky-cd Aug 30, 2023
3c42dc2
fix conflict
isky-cd Aug 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion colossalai/inference/tensor_parallel/modeling/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def llama_model_forward(
infer_state.context_mem_index = infer_state.cache_manager.alloc(infer_state.total_token_num)
infer_state.init_block_loc(infer_state.block_loc, infer_state.seq_len, seq_length, infer_state.context_mem_index)
else:
# TODO handle the condition that no contiguous memory presents
infer_state.is_context_stage = False
alloc_mem = infer_state.cache_manager.alloc_contiguous(batch_size)
if alloc_mem is not None:
infer_state.decode_is_contiguous = True
Expand Down
14 changes: 8 additions & 6 deletions tests/test_infer/test_llama_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@

os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
TPSIZE = 2
BATCH_SIZE = 8
MAX_INPUT_LEN = 12
MAX_OUTPUT_LEN = 100

def init_to_get_rotary(self, base=10000):
self.config.head_dim_ = self.config.hidden_size // self.config.num_attention_heads
Expand Down Expand Up @@ -48,21 +51,20 @@ def run_llama_test(test_config):
model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.eos_token_id)
init_to_get_rotary(model.model, base=10000)
model = model.half()
model.to(torch.cuda.current_device())

text = "Introduce some landmarks in Beijing"
input_ids = tokenizer.encode(text, return_tensors='pt')
# pg_mesh = ProcessGroupMesh(1, 1, test_config["tp_size"])
text = "how is weather today?"
input_ids = tokenizer.encode(text, return_tensors='pt', device='cuda')

infer_engine = TPInferEngine(model.half(), 4, 12, 8)
infer_engine = TPInferEngine(model.half(), BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
shard_config = ShardConfig(enable_tensor_parallelism=True, inference_only=True)
shardformer = ShardFormer(shard_config=shard_config)

infer_engine.prepare_with_shard_config(shard_config)
infer_engine.shard_model_by(shardformer)

generate_kwargs = dict(do_sample=False)
generate_kwargs = dict(max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)
outputs = infer_engine.generate(input_ids, generate_kwargs)
print("outputs.shape: ", outputs.shape)

print("outputs: ", outputs)

Expand Down