diff --git a/CMakeLists.txt b/CMakeLists.txt
index f908fd9..2afc6f6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -70,7 +70,10 @@ endfunction()
 build_exec(main src/main.cpp)
 # build_exec(main_qwen src/main_qwen.cpp)
 
-file(GLOB RUN_SCRIPT "${CMAKE_SOURCE_DIR}/scripts/*")
+file(GLOB RUN_SCRIPT "${CMAKE_SOURCE_DIR}/scripts/*.py" "${CMAKE_SOURCE_DIR}/scripts/*.sh")
 install(FILES ${RUN_SCRIPT} DESTINATION bin/)
 
+file(GLOB LLAMA3_TOKENIZER "${CMAKE_SOURCE_DIR}/scripts/llama3_tokenizer/*")
+install(FILES ${LLAMA3_TOKENIZER} DESTINATION bin/llama3_tokenizer/)
+
 # add_executable(fp32_to_bf16 tools/fp32_to_bf16.cpp)
diff --git a/scripts/run_phi3_mini.sh b/scripts/run_phi3_mini.sh
new file mode 100644
index 0000000..c66c26c
--- /dev/null
+++ b/scripts/run_phi3_mini.sh
@@ -0,0 +1,13 @@
+./main \
+--template_filename_axmodel "phi3-int8/llama_l%d.axmodel" \
+--axmodel_num 32 \
+--tokenizer_type 3 \
+--bos 1 --eos 0 \
+--filename_tokenizer_model tokenizer.model \
+--filename_post_axmodel phi3-int8/llama_post.axmodel \
+--filename_tokens_embed phi3-int8/model.embed_tokens.weight.bfloat16.bin \
+--tokens_embed_num 32064 \
+--tokens_embed_size 3072 \
+--live_print 1 \
+--continue 1 \
+--prompt "$1"
diff --git a/src/main.cpp b/src/main.cpp
index d6503fc..647a031 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -27,6 +27,9 @@ std::string prompt_complete(std::string prompt, TokenizerType tokenizer_type)
         oss_prompt << "<|user|>\n"
                    << prompt << "<|assistant|>\n";
         break;
+    case TKT_Phi3:
+        oss_prompt << prompt << " ";
+        break;
     case TKT_Qwen:
         oss_prompt << "<|im_start|>system\nYou are a helpful assistant.<|im_end|>";
         oss_prompt << "\n<|im_start|>user\n"
@@ -52,7 +55,7 @@ int main(int argc, char *argv[])
     cmd.add<std::string>("prompt", 'p', "prompt", true, prompt);
     cmd.add<std::string>("template_filename_axmodel", 0, "axmodel path template", false, attr.template_filename_axmodel);
     cmd.add<std::string>("filename_post_axmodel", 0, "post axmodel path", false, attr.filename_post_axmodel);
-    cmd.add<int>("tokenizer_type", 0, "tokenizer type 0:LLaMa 1:Qwen 2:HTTP", false, attr.tokenizer_type);
+    cmd.add<int>("tokenizer_type", 0, "tokenizer type 0:LLaMa 1:Qwen 2:HTTP 3:Phi3", false, attr.tokenizer_type);
     cmd.add<std::string>("filename_tokenizer_model", 0, "tokenizer model path", false, attr.filename_tokenizer_model);
     cmd.add<std::string>("filename_tokens_embed", 0, "tokens embed path", false, attr.filename_tokens_embed);
diff --git a/src/runner/LLM.hpp b/src/runner/LLM.hpp
index f5e6668..ac40054 100644
--- a/src/runner/LLM.hpp
+++ b/src/runner/LLM.hpp
@@ -200,7 +200,7 @@ class LLM
         return true;
     }
 
-    LLMAttrType* getAttr()
+    LLMAttrType *getAttr()
     {
         return &_attr;
     }
@@ -239,6 +239,7 @@ class LLM
         int len_of_input = token_ids.size();
         timer t_cost;
         // print token_ids
+        // printf("%s\n", input_str.c_str());
         // for (size_t i = 0; i < token_ids.size(); i++)
         // {
         //     printf("%d ", token_ids[i]);
@@ -345,7 +346,22 @@ class LLM
                 }
             }
             next_token = max_index;
+
+            if (tokenizer->isEnd(max_index))
+            {
+                if (cached_token.size())
+                {
+                    float t_cost_ms = t_cost.cost();
+                    float token_per_sec = token_ids.size() / (t_cost_ms / 1000);
+                    auto tmp_out = tokenizer->Decode(cached_token);
+                    _attr.runing_callback(cached_token.data(), cached_token.size(), tmp_out.c_str(), token_per_sec, _attr.reserve);
+                    cached_token.clear();
+                }
+                b_hit_eos = true;
+                break;
+            }
             token_ids.push_back(max_index);
+
             if (_attr.runing_callback)
             {
                 cached_token.push_back(max_index);
@@ -358,12 +374,6 @@ class LLM
                     cached_token.clear();
                 }
             }
-
-            if (max_index == tokenizer->GetEosID())
-            {
-                b_hit_eos = true;
-                break;
-            }
         }
         if (_attr.runing_callback == nullptr)
             update_cqdm(&cqdm, indices, "token", "");
diff --git a/src/runner/Tokenizer/Tokenizer.cpp b/src/runner/Tokenizer/Tokenizer.cpp
index 980db2f..5b7b61f 100644
--- a/src/runner/Tokenizer/Tokenizer.cpp
+++ b/src/runner/Tokenizer/Tokenizer.cpp
@@ -88,6 +88,88 @@ class TokenizerLLaMa : public BaseTokenizer
     }
 };
 
+class TokenizerPhi3 : public BaseTokenizer
+{
+    sentencepiece::SentencePieceProcessor sp;
+    bool _b_bos, _b_eos;
+
+private:
+    /* data */
+public:
+    bool Init(std::string model_path, bool b_bos = true, bool b_eos = false) override
+    {
+        auto ret = sp.Load(model_path);
+        if (!ret.ok())
+        {
+            ALOGE("%s", ret.error_message());
+            return false;
+        }
+
+        this->_b_bos = b_bos;
+        this->_b_eos = b_eos;
+        return ret.ok();
+    }
+
+    bool Encode(std::string input, std::vector<int> &output) override
+    {
+        auto ret = sp.Encode(input, &output);
+        if (!ret.ok())
+        {
+            ALOGE("%s", ret.error_message());
+            return false;
+        }
+        output.insert(output.begin(), 32010); //"<|user|>"
+        output.push_back(32007); //"<|end|>"
+        output.push_back(32001); //"<|assistant|>"
+        if (_b_bos)
+        {
+            output.insert(output.begin(), sp.bos_id());
+        }
+        if (_b_eos)
+        {
+            output.push_back(sp.eos_id());
+        }
+        return true;
+    }
+
+    std::vector<int> Encode(std::string input) override
+    {
+        std::vector<int> output;
+        Encode(input, output);
+        return output;
+    }
+
+    std::string Decode(const std::vector<int> input) override
+    {
+        sentencepiece::SentencePieceText spt;
+        sp.Decode(input, &spt);
+        std::string out = spt.pieces()[0].piece();
+        if (*(unsigned short *)out.data() == 38626)
+        {
+            return " " + spt.text();
+        }
+        else
+        {
+            return spt.text();
+        }
+    }
+
+    int GetBosID() override
+    {
+        return sp.bos_id();
+    }
+
+    int GetEosID() override
+    {
+        return 32007;
+    }
+
+    bool isEnd(int id) override
+    {
+        return id == GetEosID() || id > 31999;
+    }
+};
+
 class TokenizerQwen : public BaseTokenizer
 {
     std::shared_ptr<QwenTokenizer> sp;
@@ -370,6 +452,8 @@ std::shared_ptr<BaseTokenizer> CreateTokenizer(TokenizerType type)
         return std::make_shared<TokenizerLLaMa>();
     case TKT_Qwen:
         return std::make_shared<TokenizerQwen>();
+    case TKT_Phi3:
+        return std::make_shared<TokenizerPhi3>();
     default:
         return nullptr;
     }
diff --git a/src/runner/Tokenizer/Tokenizer.hpp b/src/runner/Tokenizer/Tokenizer.hpp
index cc213e2..25ccbc0 100644
--- a/src/runner/Tokenizer/Tokenizer.hpp
+++ b/src/runner/Tokenizer/Tokenizer.hpp
@@ -8,6 +8,7 @@ enum TokenizerType
     TKT_LLaMa,
     TKT_Qwen,
     TKT_HTTP,
+    TKT_Phi3,
     TKT_END
 };
 
@@ -20,6 +21,8 @@ class BaseTokenizer
     virtual std::string Decode(const std::vector<int> input) = 0;
     virtual int GetBosID() = 0;
    virtual int GetEosID() = 0;
+
+    virtual bool isEnd(int id) { return id == GetEosID(); }
 };
 
 std::shared_ptr<BaseTokenizer> CreateTokenizer(TokenizerType type);
\ No newline at end of file
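
Notes (not part of the patch):

* In TokenizerPhi3::Decode(), the magic number 38626 is the first two bytes of
  the SentencePiece word-boundary marker "▁" (U+2581, UTF-8 0xE2 0x96 0x81)
  read as a little-endian unsigned short (0x96E2 == 38626); the branch restores
  the leading space that SentencePieceText::text() otherwise drops.
* In isEnd(), id > 31999 treats every Phi-3 special token as a stop signal:
  the base vocabulary ends at id 31999 and the added control tokens start at
  32000 (the embedding table holds 32064 rows, matching --tokens_embed_num in
  run_phi3_mini.sh).

A minimal sketch of how the new pieces fit together, assuming the repository's
include layout; the tokenizer.model path and the prompt are placeholders:

    #include "runner/Tokenizer/Tokenizer.hpp"
    #include <cstdio>
    #include <vector>

    int main()
    {
        // TKT_Phi3 selects the new TokenizerPhi3 backend.
        auto tok = CreateTokenizer(TKT_Phi3);
        if (!tok || !tok->Init("tokenizer.model", /*b_bos=*/true, /*b_eos=*/false))
            return 1;

        // Encode() wraps the prompt as <|user|> ... <|end|><|assistant|>
        // (ids 32010 / 32007 / 32001) and then prepends BOS.
        std::vector<int> ids = tok->Encode("What is the capital of France?");

        // LLM::Run() now stops as soon as isEnd() fires, i.e. on <|end|>
        // (32007) or any other id above 31999, instead of comparing against
        // GetEosID() alone.
        for (int id : ids)
            printf("%d%s ", id, tok->isEnd(id) ? "[stop]" : "");
        printf("\n");
        return 0;
    }

End to end, the installed script drives the same path:
./run_phi3_mini.sh "your question here" (its --tokenizer_type 3 maps to
TKT_Phi3 in the enum above).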