Skip to content

Commit 2df92cc

Browse files
mikepapadim and Copilot authored
Apply suggestion from @Copilot
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent c49558f commit 2df92cc

File tree

1 file changed

+4
-1
lines changed

1 file changed

+4
-1
lines changed

src/main/java/com/example/tokenizer/impl/Phi3Tokenizer.java

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,12 +33,15 @@ public class Phi3Tokenizer implements Tokenizer {
3333
private final int[] tokenType;
3434
private final int byte0;
3535

36+
/** Number of base tokens in the vocabulary; tokens at this index and beyond are considered special. */
37+
private static final int BASE_TOKENS = 32000;
38+
3639
public Phi3Tokenizer(Map<String, Object> metadata, Vocabulary vocabulary) {
3740
int[] tokenTypes = (int[]) metadata.get("tokenizer.ggml.token_type");
3841
List<Pair<Integer, Integer>> merges = Collections.emptyList();
3942

4043
int allTokens = vocabulary.size();
41-
int baseTokens = 32000; // assume all tokens after the base ones are special.
44+
int baseTokens = BASE_TOKENS; // assume all tokens after the base ones are special.
4245
//int reservedSpecialTokens = allTokens - baseTokens;
4346
List<String> specialTokensList = Arrays.stream(vocabulary.tokens(), baseTokens, allTokens).toList();
4447

0 commit comments

Comments
 (0)