@@ -753,12 +753,7 @@ struct server_context {
         metrics.init();
     }
 
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
-        // TODO: currently, we tokenize using special tokens by default
-        //       this is not always correct (see https://github.com./ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
-        //       but it's better compared to completely ignoring ChatML and other chat templates
-        const bool TMP_FORCE_SPECIAL = true;
-
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special, bool parse_special) const {
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
@@ -771,10 +766,10 @@ struct server_context {
 
                     std::vector<llama_token> p;
                     if (first) {
-                        p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+                        p = ::llama_tokenize(ctx, s, add_special, parse_special);
                         first = false;
                     } else {
-                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                        p = ::llama_tokenize(ctx, s, false, parse_special);
                     }
 
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
@@ -788,7 +783,7 @@ struct server_context {
                 }
             }
         } else {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_special, parse_special);
         }
 
         return prompt_tokens;
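
Note (not part of the diff): the new `parse_special` flag replaces the old `TMP_FORCE_SPECIAL` constant, so each caller now decides whether special-token text in the prompt is mapped to special token ids. A minimal sketch of the difference, using the `llama_tokenize` helper from common.h that the diff itself calls; the ChatML string and the demo function name are illustrative assumptions, not code from the patch:

#include "common.h"
#include <cstdio>

// ctx is assumed to be an already-initialized llama_context.
static void tokenize_demo(llama_context * ctx) {
    const std::string text = "<|im_start|>user\nhello<|im_end|>";

    // parse_special = true: chat-template tags are mapped to their special token ids
    const auto toks_special = llama_tokenize(ctx, text, /*add_special=*/true, /*parse_special=*/true);

    // parse_special = false: the same tags are tokenized as ordinary text pieces
    const auto toks_plain   = llama_tokenize(ctx, text, /*add_special=*/true, /*parse_special=*/false);

    // the two results generally differ in length and content
    printf("parse_special=true  -> %zu tokens\n", toks_special.size());
    printf("parse_special=false -> %zu tokens\n", toks_plain.size());
}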
@@ -1215,7 +1210,7 @@ struct server_context {
                     slot.params.n_predict, n_ctx_train);
         }
 
-        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str());
+        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
 
         return slot.has_next_token; // continue
     }
@@ -1483,9 +1478,8 @@ struct server_context {
         if (prompt.is_string() || json_is_array_of_numbers(prompt)) {
             data["index"] = 0;
             create_task(data, false, nullptr);
-        }
-        // otherwise, it's a multiple-prompt task, we break it into smaller tasks
-        else if (prompt.is_array()) {
+        } else if (prompt.is_array()) {
+            // otherwise, it's a multiple-prompt task, we break it into smaller tasks
             std::vector<json> prompts = prompt;
             if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
                 // prompts[0] is the question
@@ -1510,9 +1504,8 @@ struct server_context {
                     }
                 }
             }
-        }
-        // invalid case
-        else {
+        } else {
+            // invalid case
             throw std::runtime_error(error_msg);
         }
 
@@ -1971,70 +1964,69 @@ struct server_context {
                 slot.t_start_process_prompt = ggml_time_us();
                 slot.t_start_generation = 0;
 
-                if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) {
-                    const bool add_bos = llama_add_bos_token(model);
-                    bool suff_rm_leading_spc = true;
-                    if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-                        params.input_suffix.erase(0, 1);
-                        suff_rm_leading_spc = false;
-                    }
-
-                    auto prefix_tokens = tokenize(slot.params.input_prefix, false);
-                    auto suffix_tokens = tokenize(slot.params.input_suffix, false);
-
-                    const int space_token = 29871; // TODO: this should not be hardcoded
-                    if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
-                        suffix_tokens.erase(suffix_tokens.begin());
-                    }
-
-                    prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                    suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
-
-                    auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
-                    auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
-                    if (add_bos) {
-                        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
-                    }
-                    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
-
-                    const llama_token middle_token = llama_token_middle(model);
-                    if (middle_token >= 0) {
-                        embd_inp.push_back(middle_token);
-                    }
-
-                    prompt_tokens = embd_inp;
-                } else if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
-                    // require slot.prompt to be array of 2 strings
-                    if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
-                        SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
-                        slot.release();
-                        send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST);
-                        continue;
-                    }
-
-                    // prompt: [BOS]query[EOS][SEP]doc[EOS]
-                    prompt_tokens.clear();
-                    prompt_tokens.push_back(llama_token_bos(model));
-                    {
-                        const auto part = tokenize(slot.prompt[0], false);
-                        prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
-                    }
-                    prompt_tokens.push_back(llama_token_eos(model));
-                    prompt_tokens.push_back(llama_token_sep(model));
-                    {
-                        const auto part = tokenize(slot.prompt[1], false);
-                        prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
-                    }
-                    prompt_tokens.push_back(llama_token_eos(model));
-                } else {
-                    prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+                switch (slot.cmpl_type) {
+                    case SERVER_TASK_CMPL_TYPE_NORMAL:
+                    case SERVER_TASK_CMPL_TYPE_EMBEDDING:
+                        {
+                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty(), true); // add BOS if there isn't system prompt
+                        } break;
+                    case SERVER_TASK_CMPL_TYPE_RERANK:
+                        {
+                            // require slot.prompt to be array of 2 strings
+                            if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
+                                SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
+                                slot.release();
+                                send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST);
+                                continue;
+                            }
+
+                            // prompt: [BOS]query[EOS][SEP]doc[EOS]
+                            prompt_tokens.clear();
+                            prompt_tokens.push_back(llama_token_bos(model));
+                            {
+                                const auto part = tokenize(slot.prompt[0], false, false);
+                                prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
+                            }
+                            prompt_tokens.push_back(llama_token_eos(model));
+                            prompt_tokens.push_back(llama_token_sep(model));
+                            {
+                                const auto part = tokenize(slot.prompt[1], false, false);
+                                prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
+                            }
+                            prompt_tokens.push_back(llama_token_eos(model));
+                        } break;
+                    case SERVER_TASK_CMPL_TYPE_INFILL:
+                        {
+                            auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
+                            auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
+
+                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
+                            suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));
+
+                            auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
+                            auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+
+                            if (llama_add_bos_token(model)) {
+                                embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+                            }
+
+                            embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+                            embd_inp.push_back(llama_token_fim_mid(model));
+
+                            prompt_tokens = std::move(embd_inp);
+                        } break;
                 }
 
                 slot.n_past = 0;
                 slot.n_prompt_tokens = prompt_tokens.size();
 
                 SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
 
+                // print prompt tokens:
+                for (int i = 0; i < (int) prompt_tokens.size(); i++) {
+                    SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], llama_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                }
+
                 // empty prompt passed -> release the slot and send empty response
                 if (prompt_tokens.empty()) {
                     SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
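
Note (not part of the diff): for reference, a minimal standalone sketch of the token order the new infill branch produces. Assumptions: `model` is a loaded `llama_model *`, the two vectors were produced by `tokenize(..., false, false)` as above, and the helper name `build_infill_prompt` is illustrative only.

#include "llama.h"
#include <vector>

// Sketch only: mirrors the SERVER_TASK_CMPL_TYPE_INFILL case above.
// PSM (default):       [BOS?] [FIM_PRE] prefix ... [FIM_SUF] suffix ... [FIM_MID]
// SPM (--spm-infill):  [BOS?] [FIM_SUF] suffix ... [FIM_PRE] prefix ... [FIM_MID]
static std::vector<llama_token> build_infill_prompt(
        const llama_model * model,
        std::vector<llama_token> prefix_tokens,
        std::vector<llama_token> suffix_tokens,
        bool spm_infill) {
    prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
    suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));

    auto embd_inp = spm_infill ? suffix_tokens : prefix_tokens;
    auto embd_end = spm_infill ? prefix_tokens : suffix_tokens;

    if (llama_add_bos_token(model)) {
        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
    }

    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
    embd_inp.push_back(llama_token_fim_mid(model));

    return embd_inp;
}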
@@ -2924,7 +2916,23 @@ int main(int argc, char ** argv) {
         return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res);
     };
 
-    const auto handle_infill = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+        std::string err;
+        if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "prefix token is missing. ";
+        }
+        if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "suffix token is missing. ";
+        }
+        if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "middle token is missing. ";
+        }
+
+        if (!err.empty()) {
+            res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
         json data = json::parse(req.body);
         return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res);
     };
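
Note (not part of the diff): the three checks above could equally be expressed as a single capability test. A sketch under that assumption; the helper name `model_supports_infill` is hypothetical, the token getters and LLAMA_TOKEN_NULL are the ones the patch relies on.

#include "llama.h"

// Sketch: true if the model defines all FIM tokens required by the /infill endpoint.
static bool model_supports_infill(const llama_model * model) {
    return llama_token_fim_pre(model) != LLAMA_TOKEN_NULL
        && llama_token_fim_suf(model) != LLAMA_TOKEN_NULL
        && llama_token_fim_mid(model) != LLAMA_TOKEN_NULL;
}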
@@ -3010,7 +3018,8 @@ int main(int argc, char ** argv) {
             if (body.count("content") != 0) {
                 const bool add_special = json_value(body, "add_special", false);
                 const bool with_pieces = json_value(body, "with_pieces", false);
-                std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+
+                std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special, true);
 
                 if (with_pieces) {
                     for (const auto & token : tokens) {