Pre-train BERT (Chinese language model) from scratch
import transformers, tokenizers
transformers.__version__, tokenizers.__version__
('4.24.0', '0.13.2')
import tokenizers
from transformers import BertTokenizer, LineByLineTextDataset
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline
# Train a WordPiece vocabulary on the raw Chinese corpus
bwpt = tokenizers.BertWordPieceTokenizer(vocab_file=None)

filepath = "./train.txt"

bwpt.train(
    files=[filepath],
    vocab_size=50000,
    min_frequency=3,
    limit_alphabet=1000
)
# Writes the learned vocabulary to ./训练中文的bert输出/name-vocab.txt
bwpt.save('./训练中文的bert输出/', 'name')
['./训练中文的bert输出/name-vocab.txt']
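Note that the tokenizer training wrote its vocabulary to ./训练中文的bert输出/name-vocab.txt, while the next cell loads ./vocab.txt. That copy/rename step is not shown in the notebook; a minimal sketch of what presumably happened (the shutil call is my addition, not from the original):

import shutil
# assumption: make the trained vocab available at the path used below
shutil.copy('./训练中文的bert输出/name-vocab.txt', './vocab.txt')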
# Load a BertTokenizer from the vocab file produced by the WordPiece training step
vocab_file_dir = './vocab.txt'
tokenizer = BertTokenizer.from_pretrained(vocab_file_dir)

sentence = '今天晚上我要吃啵啵鱼'
encoded_input = tokenizer.tokenize(sentence)
print(encoded_input)
['今', '天', '晚', '上', '我', '要', '吃', '啵', '啵', '鱼']
C:\Users\dupeibo\Anaconda3\envs\pt\lib\site-packages\transformers\tokenization_utils_base.py:1679: FutureWarning: Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated and won't be possible anymore in v5. Use a model identifier or the path to a directory instead.
warnings.warn(
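The FutureWarning is raised because a single vocab file is passed to from_pretrained. Pointing it at a directory that contains a vocab.txt avoids the warning; a small sketch, assuming the vocab file is first copied into a directory (the ./my_tokenizer path is hypothetical):

import os, shutil
os.makedirs('./my_tokenizer', exist_ok=True)
shutil.copy('./vocab.txt', './my_tokenizer/vocab.txt')
tokenizer = BertTokenizer.from_pretrained('./my_tokenizer')  # directory-based loading, no FutureWarning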
%%time
'''
transformers has a predefined class LineByLineTextDataset()
which reads your text line by line and converts them to tokens
'''
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='./train.txt',
    block_size=128
)
print('No. of lines: ', len(dataset))
dataset
No. of lines: 24494
Wall time: 10.4 s
<transformers.data.datasets.language_modeling.LineByLineTextDataset at 0x2084eb42ac8>
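Depending on the transformers release, each element of this dataset is either a plain tensor of token ids or a dict with an input_ids key. A quick way to check (hypothetical inspection code, not part of the original notebook):

sample = dataset[0]
print(type(sample))   # e.g. a dict with 'input_ids', or a torch.LongTensor on older releases
print(sample)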
# A lighter BERT configuration: 6 transformer layers instead of BERT-base's 12
config = BertConfig(
    vocab_size=50000,
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=512
)

model = BertForMaskedLM(config)
print('No of parameters: ', model.num_parameters())

# Dynamically mask 15% of the tokens for the masked-language-modeling objective
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)
No of parameters: 82556240
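The collator is what turns plain token sequences into masked-LM training examples: roughly 15% of the tokens are selected, most of those are replaced with [MASK], and only the selected positions receive real labels, the rest being set to -100 so they are ignored by the loss. A small sketch to inspect one collated batch, assuming a transformers 4.x collator that is directly callable (the inspection code is mine, not from the original notebook):

batch = data_collator([dataset[i] for i in range(2)])
print(batch['input_ids'].shape)   # (2, padded_sequence_length)
print(batch['labels'][0][:20])    # -100 everywhere except the masked positions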
%%time
training_args = TrainingArguments(
    output_dir='./训练中文的bert输出/',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,  # note: recent transformers releases expect this on TrainingArguments instead
)

trainer.train()  # start pre-training; the logs below come from this call
Epoch: 0%| | 0/1 [00:00<?, ?it/s]
Iteration: 0%| | 0/6124 [00:00<?, ?it/s]
{"loss": 7.0956006441116335, "learning_rate": 4.591770084911823e-05, "epoch": 0.08164598301763554, "step": 500}
{"loss": 6.214028715133667, "learning_rate": 4.183540169823645e-05, "epoch": 0.16329196603527107, "step": 1000}
{"loss": 5.860672056674957, "learning_rate": 3.775310254735467e-05, "epoch": 0.24493794905290658, "step": 1500}
{"loss": 5.599938702583313, "learning_rate": 3.367080339647289e-05, "epoch": 0.32658393207054215, "step": 2000}
{"loss": 5.412256263256073, "learning_rate": 2.958850424559112e-05, "epoch": 0.4082299150881777, "step": 2500}
{"loss": 5.261007954120636, "learning_rate": 2.550620509470934e-05, "epoch": 0.48987589810581317, "step": 3000}
{"loss": 5.095327672958374, "learning_rate": 2.1423905943827566e-05, "epoch": 0.5715218811234487, "step": 3500}
{"loss": NaN, "learning_rate": 1.734160679294579e-05, "epoch": 0.6531678641410843, "step": 4000}
{"loss": NaN, "learning_rate": 1.3259307642064011e-05, "epoch": 0.7348138471587198, "step": 4500}
{"loss": NaN, "learning_rate": 9.177008491182235e-06, "epoch": 0.8164598301763554, "step": 5000}
{"loss": NaN, "learning_rate": 5.094709340300458e-06, "epoch": 0.8981058131939909, "step": 5500}
{"loss": NaN, "learning_rate": 1.0124101894186806e-06, "epoch": 0.9797517962116263, "step": 6000}
Wall time: 17min 31s
TrainOutput(global_step=6124, training_loss=nan)
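The next cell reloads the model from './', which implies the trained weights were written to the current directory at some point; with save_steps=10_000 and only 6124 steps, no automatic checkpoint was saved, so there was presumably an explicit save along these lines (my reconstruction, not shown in the captured cells):

trainer.save_model('./')   # writes config.json and the model weights into the current directory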
model = BertForMaskedLM.from_pretrained('./')

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer
)
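The two prediction lists below are consistent with mask-filling on two short phrases. The exact calls are not visible in the captured cell, but they were presumably something like the following (reconstructed from the returned sequences, so treat the inputs as assumptions):

fill_mask('心[MASK]病')   # first list below; '心 脏 病' is the top prediction
fill_mask('怀[MASK]')     # second list below; '怀 孕' is the top prediction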
[{'score': 0.3431047201156616,
'token': 5552,
'token_str': '脏',
'sequence': '心 脏 病'},
{'score': 0.07011183351278305,
'token': 2552,
'token_str': '心',
'sequence': '心 心 病'},
{'score': 0.05838495120406151,
'token': 4567,
'token_str': '病',
'sequence': '心 病 病'},
{'score': 0.014283978380262852,
'token': 107,
'token_str': '"',
'sequence': '心 " 病'},
{'score': 0.011550793424248695,
'token': 7315,
'token_str': '闷',
'sequence': '心 闷 病'}]
[{'score': 0.4942161738872528,
'token': 2097,
'token_str': '孕',
'sequence': '怀 孕'},
{'score': 0.050573259592056274,
'token': 2577,
'token_str': '怀',
'sequence': '怀 怀'},
{'score': 0.01493070088326931,
'token': 107,
'token_str': '"',
'sequence': '怀 "'},
{'score': 0.010810167528688908,
'token': 5307,
'token_str': '经',
'sequence': '怀 经'},
{'score': 0.00741073302924633,
'token': 1453,
'token_str': '周',
'sequence': '怀 周'}]