Character-level tokenization: instead of using the official tokenizer (https://github.com/google-research/bert/blob/master/tokenization.py), write a simple one yourself:
def tokenize_to_str_list(textString):
    # Split the input string into a list of single-character tokens.
    split_tokens = []
    for i in range(len(textString)):
        split_tokens.append(textString[i])
    return split_tokens
def convert_to_int_list(split_tokens):
    # Map each character token to its integer id via the char2id
    # vocabulary; characters not in the vocabulary are skipped.
    output = []
    for token in split_tokens:
        if token in char2id:
            output.append(char2id[token])
    return output
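For reference, a minimal usage sketch of the two functions above; the char2id mapping shown here is a hypothetical stand-in for whatever character-to-id vocabulary is actually built from the training corpus:

# Hypothetical vocabulary; in practice char2id would be built by
# enumerating every distinct character in the training text.
char2id = {"你": 0, "好": 1, "世": 2, "界": 3}

tokens = tokenize_to_str_list("你好世界")  # ['你', '好', '世', '界']
ids = convert_to_int_list(tokens)          # [0, 1, 2, 3]
print(tokens, ids)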