Skip to content

Commit

Permalink
tests : add fail test for llama-bpe
Browse files: browse the repository at this point in the history
  • Loading branch information
ggerganov committed May 9, 2024
1 parent 8de8b6d commit 12a7b69
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 7 deletions.
1 change: 1 addition & 0 deletions convert-hf-to-gguf-update.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
"3333333",
"33333333",
"333333333",
# "Cửa Việt", # llama-bpe fails on this
chktxt,
]

Expand Down
14 changes: 7 additions & 7 deletions unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
std::unordered_map<uint32_t, int> cpt_types;
for (auto p : unicode_ranges_number) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_NUMBER;
}
}
for (auto p : unicode_ranges_letter) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_LETTER;
}
}
for (auto p : unicode_ranges_separator) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
}
}
for (auto p : unicode_ranges_accent_mark) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
}
}
for (auto p : unicode_ranges_punctuation) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
}
}
Expand All @@ -142,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
}
}
for (auto p : unicode_ranges_control) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_CONTROL;
}
}
Expand Down Expand Up @@ -629,7 +629,7 @@ bool unicode_cpt_is_whitespace(uint32_t cp) {
static const std::unordered_set<uint32_t> is_whitespace = [] {
std::unordered_set<uint32_t> is_whitespace;
for (auto p : unicode_ranges_whitespace) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
is_whitespace.insert(i);
}
}
Expand Down

0 comments on commit 12a7b69

Please sign in to comment.