[1]:
from EduNLP.Tokenizer import PureTextTokenizer, TextTokenizer, get_tokenizer
D:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
TextTokenizer and PureTextTokenizer¶
The ‘text’ tokenizer symbolizes FormulaFigures as [FORMULA] and tokenizes LaTeX formulas as text.
The ‘pure_text’ tokenizer ignores and skips FormulaFigures and tokenizes LaTeX formulas as text.
TextTokenizer¶
[2]:
# One item given as a dict; only its "stem" field is handed to the tokenizer
# (selected through the `key` callable below).
items = [
    {
        "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
        "options": ["1", "2"],
    },
]

# Look the tokenizer up by name (alternatively: tokenizer = TextTokenizer()).
tokenizer = get_tokenizer("text")
tokens = tokenizer(items, key=lambda item: item["stem"])

# Pull the first result from the returned iterator and print its token list.
print(next(tokens))
['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '=']
[3]:
# A single raw item string that mixes plain text with inline `$...$` segments,
# including \FormFigureID, \FigureID, \FormFigureBase64, \SIFSep and \SIFBlank
# markers; it is reused as the input for the tokenizer cells that follow.
items = [
    "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$",
]
[4]:
# Look the 'text' tokenizer up by name (alternatively: tokenizer = TextTokenizer()).
tokenizer = get_tokenizer("text")
# list() consumes the tokenizer's output directly; the previous identity
# comprehension `[t for t in ...]` built the same list with extra noise.
tokens = list(tokenizer(items))
# Bare last expression so the notebook displays the token lists.
tokens
[4]:
[['公式',
'[FORMULA]',
'如图',
'[FIGURE]',
'x',
',',
'y',
'约束条件',
'公式',
'[FORMULA]',
'[SEP]',
'z',
'=',
'x',
'+',
'7',
'y',
'最大值',
'[MARK]']]
PureTextTokenizer¶
[5]:
# Look the 'pure_text' tokenizer up by name (alternatively: tokenizer = PureTextTokenizer()).
tokenizer = get_tokenizer("pure_text")
# list() consumes the tokenizer's output directly; the previous identity
# comprehension `[t for t in ...]` built the same list with extra noise.
tokens = list(tokenizer(items))
# Bare last expression so the notebook displays the token lists.
tokens
[5]:
[['公式',
'如图',
'[FIGURE]',
'x',
',',
'y',
'约束条件',
'公式',
'[SEP]',
'z',
'=',
'x',
'+',
'7',
'y',
'最大值',
'[MARK]']]