revert changes to llama #1553

Merged
135 changes: 0 additions & 135 deletions docker/run-bind.sh

This file was deleted.

60 changes: 60 additions & 0 deletions inference/models/llama.cc
@@ -109,6 +109,50 @@ void LLAMA::create_llama_model(FFModel &ff,

Tensor mha;
switch (mode) {
  case BEAM_SEARCH_MODE: {
    mha = ff.spec_inc_multiquery_self_attention(
        qkv_proj,
        llama_config.hidden_size,
        llama_config.num_attention_heads,
        llama_config.num_key_value_heads,
        llama_config.hidden_size / llama_config.num_attention_heads,
        llama_config.hidden_size / llama_config.num_attention_heads,
        0.0f, /*dropout*/
        false, /*add_zero_attn*/
        DT_NONE, /*data_type*/
        NULL, /*kernel_initializer*/
        llama_config.rotary_embedding_meta,
        false, /*scaling query*/
        1.0f, /*scaling factor*/
        true, /*qk_prod_scaling*/
        false, /*position_bias*/
        std::string("layers." + std::to_string(i) + ".self_attn")
            .c_str() /*name*/
    );
    break;
  }
  case TREE_VERIFY_MODE: {
    mha = ff.inc_multiquery_self_attention_verify(
        qkv_proj,
        llama_config.hidden_size,
        llama_config.num_attention_heads,
        llama_config.num_key_value_heads,
        llama_config.hidden_size / llama_config.num_attention_heads,
        llama_config.hidden_size / llama_config.num_attention_heads,
        0.0f, /*dropout*/
        false, /*add_zero_attn*/
        DT_NONE, /*data_type*/
        nullptr, /*kernel_initializer*/
        llama_config.rotary_embedding_meta,
        false, /*scaling query*/
        1.0f, /*scaling factor*/
        true, /*qk_prod_scaling*/
        false, /*position_bias*/
        std::string("layers." + std::to_string(i) + ".self_attn")
            .c_str() /*name*/
    );
    break;
  }
  case INC_DECODING_MODE: {
    mha = ff.inc_multiquery_self_attention(
        qkv_proj,
@@ -231,6 +275,13 @@ void LLAMA::create_llama_model(FFModel &ff,
"lm_head");

Tensor output;
if (mode == BEAM_SEARCH_MODE) {
  Tensor softmax = ff.softmax(dense, -1);
  // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false);
  // output = ff.argmax(softmax, /*beam_Search*/ true);
  output = ff.arg_top_k(softmax, llama_config.max_beam_width, false, true);
  // output = ff.top_k(softmax, )
} else {
  // Tensor softmax = ff.softmax(dense, -1);
  if (generation_config.do_sample) {
    dense = ff.scalar_truediv(dense, generation_config.temperature, false);
@@ -241,6 +292,15 @@ void LLAMA::create_llama_model(FFModel &ff,
    Tensor softmax = ff.softmax(dense, -1);
    output = ff.argmax(softmax, /*beam_Search*/ false);
  }
}

// If PEFT is enabled, add LoRA layers
if (ff.config.enable_peft) {
  // todo: add attention projections
  std::vector<std::string> target_modules = {
      "gate_proj", "up_proj", "down_proj"};
  ff.add_lora_layers(target_modules);
}

FileDataLoader *fileloader = new FileDataLoader(
    "",
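Reviewer note: the three cases restored in llama.cc above differ only in which FFModel attention operator they call; the remaining arguments are identical line for line. The snippet below is a hypothetical condensation of that dispatch, not code from this PR — the operator names and argument order are copied from the diff, the INC_DECODING_MODE call is truncated in the diff so its trailing arguments are assumed to mirror the other two, and the RotaryEmbeddingMeta parameter type is inferred from the rotary_embedding_meta field name.

// Hypothetical condensation of the per-mode attention dispatch restored in
// create_llama_model. Not part of this PR; argument values mirror the diff.
static Tensor attention_for_mode(FFModel &ff,
                                 InferenceMode mode,
                                 Tensor qkv_proj,
                                 int hidden_size,
                                 int num_attention_heads,
                                 int num_key_value_heads,
                                 RotaryEmbeddingMeta const &rotary_embedding_meta,
                                 char const *name) {
  int head_dim = hidden_size / num_attention_heads;
  switch (mode) {
    case BEAM_SEARCH_MODE:
      // Draft model in speculative decoding.
      return ff.spec_inc_multiquery_self_attention(
          qkv_proj, hidden_size, num_attention_heads, num_key_value_heads,
          head_dim, head_dim, 0.0f /*dropout*/, false /*add_zero_attn*/,
          DT_NONE /*data_type*/, nullptr /*kernel_initializer*/,
          rotary_embedding_meta, false /*scaling query*/,
          1.0f /*scaling factor*/, true /*qk_prod_scaling*/,
          false /*position_bias*/, name);
    case TREE_VERIFY_MODE:
      // Verification pass over speculated tokens.
      return ff.inc_multiquery_self_attention_verify(
          qkv_proj, hidden_size, num_attention_heads, num_key_value_heads,
          head_dim, head_dim, 0.0f, false, DT_NONE, nullptr,
          rotary_embedding_meta, false, 1.0f, true, false, name);
    case INC_DECODING_MODE:
    default:
      // Ordinary incremental decoding (trailing arguments assumed to match).
      return ff.inc_multiquery_self_attention(
          qkv_proj, hidden_size, num_attention_heads, num_key_value_heads,
          head_dim, head_dim, 0.0f, false, DT_NONE, nullptr,
          rotary_embedding_meta, false, 1.0f, true, false, name);
  }
}

The name argument corresponds to the per-layer "layers.<i>.self_attn" string built at each call site in the diff.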
44 changes: 44 additions & 0 deletions inference/models/mixtral.cc
@@ -118,6 +118,50 @@ void MIXTRAL::create_mixtral_model(FFModel &ff,

Tensor mha;
switch (mode) {
  case BEAM_SEARCH_MODE: {
    mha = ff.spec_inc_multiquery_self_attention(
        qkv_proj,
        mixtral_config.hidden_size,
        mixtral_config.num_attention_heads,
        mixtral_config.num_key_value_heads,
        mixtral_config.hidden_size / mixtral_config.num_attention_heads,
        mixtral_config.hidden_size / mixtral_config.num_attention_heads,
        0.0f, /*dropout*/
        false, /*add_zero_attn*/
        DT_NONE, /*data_type*/
        NULL, /*kernel_initializer*/
        mixtral_config.rotary_embedding_meta,
        false, /*scaling query*/
        1.0f, /*scaling factor*/
        true, /*qk_prod_scaling*/
        false, /*position_bias*/
        std::string("layers." + std::to_string(i) + ".self_attn")
            .c_str() /*name*/
    );
    break;
  }
  case TREE_VERIFY_MODE: {
    mha = ff.inc_multiquery_self_attention_verify(
        qkv_proj,
        mixtral_config.hidden_size,
        mixtral_config.num_attention_heads,
        mixtral_config.num_key_value_heads,
        mixtral_config.hidden_size / mixtral_config.num_attention_heads,
        mixtral_config.hidden_size / mixtral_config.num_attention_heads,
        0.0f, /*dropout*/
        false, /*add_zero_attn*/
        DT_NONE, /*data_type*/
        nullptr, /*kernel_initializer*/
        mixtral_config.rotary_embedding_meta,
        false, /*scaling query*/
        1.0f, /*scaling factor*/
        true, /*qk_prod_scaling*/
        false, /*position_bias*/
        std::string("layers." + std::to_string(i) + ".self_attn")
            .c_str() /*name*/
    );
    break;
  }
  case INC_DECODING_MODE: {
    mha = ff.inc_multiquery_self_attention(
        qkv_proj,
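The mixtral.cc hunk restores the same three-way dispatch for Mixtral, with both per-head dimension arguments again computed as hidden_size / num_attention_heads. As a standalone sanity check of that arithmetic (assuming Mixtral-8x7B's published configuration — 4096 hidden size, 32 attention heads, 8 key/value heads — which this PR itself does not pin down):

// Standalone check of the head-dimension arithmetic used in llama.cc and
// mixtral.cc. The concrete numbers assume Mixtral-8x7B; they are not taken
// from this PR.
#include <cassert>

int main() {
  int const hidden_size = 4096;
  int const num_attention_heads = 32;
  int const num_key_value_heads = 8;  // grouped-query attention

  // Both head-dimension arguments passed to the attention operators.
  int const head_dim = hidden_size / num_attention_heads;
  assert(head_dim == 128);

  // With fewer KV heads than query heads, the key/value projections are
  // correspondingly narrower than the query projection.
  assert(num_attention_heads * head_dim == 4096);
  assert(num_key_value_heads * head_dim == 1024);
  return 0;
}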