"""Print the value of every "text" token found in a Wikipedia dump file.

The dump is opened in binary mode and the file handle is passed to
``WikiTokenizer``; each token the tokenizer yields whose ``type`` is
``"text"`` has its ``value`` written to stdout.
"""
from mediawiki_dump.tokenizer import WikiTokenizer

# Location of the dump to read, relative to the current working directory.
DUMP_PATH = (
    "wiki_train/wikipedia_test_5files_parallel/"
    "enwiki-latest-pages-articles1.xml-p1p41242.bz2"
)

# NOTE(review): the file has a .bz2 suffix but is opened with the plain
# built-in ``open`` in "rb" mode, so the tokenizer receives the raw bytes as
# stored on disk. Presumably WikiTokenizer handles decompression itself --
# TODO confirm; otherwise wrap the handle with ``bz2.open``.
with open(DUMP_PATH, "rb") as dump:
    tokenizer = WikiTokenizer(dump)
    # Iterate inside the ``with`` block: the tokenizer reads from ``dump``,
    # which is closed as soon as the block exits.
    for token in tokenizer:
        if token.type == "text":
            print(token.value)