江东的笔记


Pre-training

Pre-train BERT (Chinese language model) from scratch

import transformers, tokenizers
transformers.__version__, tokenizers.__version__
('4.24.0', '0.13.2')
import tokenizers
from transformers import BertTokenizer, LineByLineTextDataset
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline

# Train a tokenizer

bwpt = tokenizers.BertWordPieceTokenizer()  # start from an empty vocab and train it below

# filepath = "../input/bert-bangla/raw_bangla_for_BERT.txt"
filepath = "./train.txt"

bwpt.train(
    files=[filepath],
    vocab_size=50000,
    min_frequency=3,
    limit_alphabet=1000
)

bwpt.save_model('./训练中文的bert输出/', 'name')  # writes name-vocab.txt into the output directory
['./训练中文的bert输出/name-vocab.txt']
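As a quick sanity check (not part of the original notebook), the freshly trained WordPiece tokenizer can be exercised directly through the tokenizers API before it is wrapped in a transformers BertTokenizer; the sentence below is just an arbitrary example.

# Hypothetical sanity check: encode a sample sentence with the newly
# trained WordPiece vocab and look at the resulting tokens and ids.
sample = '今天晚上我要吃啵啵鱼'
enc = bwpt.encode(sample)
print(enc.tokens)  # WordPiece tokens produced by the new vocab
print(enc.ids)     # the corresponding token ids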
# Load the tokenizer

# vocab_file_dir = '/kaggle/input/bert-bangla/bangla-vocab.txt'
vocab_file_dir = './vocab.txt'

tokenizer = BertTokenizer.from_pretrained(vocab_file_dir)

sentence = '今天晚上我要吃啵啵鱼'

encoded_input = tokenizer.tokenize(sentence)
print(encoded_input)
# print(encoded_input['input_ids'])
['今', '天', '晚', '上', '我', '要', '吃', '啵', '啵', '鱼']


C:\Users\dupeibo\Anaconda3\envs\pt\lib\site-packages\transformers\tokenization_utils_base.py:1679: FutureWarning: Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated and won't be possible anymore in v5. Use a model identifier or the path to a directory instead.
  warnings.warn(
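The FutureWarning above is triggered by passing a single vocab file to from_pretrained. A forward-compatible alternative (a sketch; the directory name below is hypothetical) is to build the tokenizer from the vocab file explicitly and, if desired, save it to a directory that from_pretrained can point at later:

# Build the tokenizer directly from the vocab file instead of calling
# from_pretrained with a single file path (deprecated for transformers v5).
tokenizer = BertTokenizer(vocab_file='./vocab.txt')

# Optionally persist it to a directory; './chinese_bert_tokenizer/' is a
# hypothetical path that can then be passed to from_pretrained.
tokenizer.save_pretrained('./chinese_bert_tokenizer/')
tokenizer = BertTokenizer.from_pretrained('./chinese_bert_tokenizer/')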
%%time

'''
transformers provides a predefined LineByLineTextDataset class,
which reads the text file line by line and converts each line into token ids.
'''

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    # file_path = '/kaggle/input/bert-bangla/raw_bangla_for_BERT.txt',
    file_path='./train.txt',
    block_size=128  # maximum sequence length
)

print('No. of lines: ', len(dataset))  # number of lines in the dataset
dataset
No. of lines:  24494
Wall time: 10.4 s

<transformers.data.datasets.language_modeling.LineByLineTextDataset at 0x2084eb42ac8>
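To see what a single training example looks like (an optional check, not in the original notebook), each item of LineByLineTextDataset in transformers 4.x should be a dict holding the token ids of one line, already wrapped in [CLS] ... [SEP]:

# Optional check of one example; assumes the 4.x behaviour where each
# item is a dict with an 'input_ids' tensor for a single line.
example = dataset[0]
print(example['input_ids'][:20])  # first few token ids of the first line
print(tokenizer.convert_ids_to_tokens(example['input_ids'][:20].tolist()))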

config = BertConfig(
    vocab_size=50000,
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=512
)

model = BertForMaskedLM(config)
print('No of parameters: ', model.num_parameters())


# Masking mechanism for the MLM objective: mask 15% of tokens
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
No of parameters:  82556240
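To illustrate what the collator does (an optional illustration, not part of the original notebook), it pads a list of examples into a batch and randomly selects about 15% of the token positions for the masked-language-modelling loss; labels are -100 everywhere else:

# Optional illustration of the MLM collator on a tiny batch (assumes the
# transformers 4.x collator, which accepts a list of dict examples).
batch = data_collator([dataset[0], dataset[1]])
print(batch['input_ids'][0])  # some positions replaced by the [MASK] id (or random tokens)
print(batch['labels'][0])     # original ids at masked positions, -100 elsewhere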

training_args = TrainingArguments(
    output_dir='./训练中文的bert输出/',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,  # in recent transformers this is a TrainingArguments option, not a Trainer argument
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
%%time
trainer.train()

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6124 [00:00<?, ?it/s]


{"loss": 7.0956006441116335, "learning_rate": 4.591770084911823e-05, "epoch": 0.08164598301763554, "step": 500}
{"loss": 6.214028715133667, "learning_rate": 4.183540169823645e-05, "epoch": 0.16329196603527107, "step": 1000}
{"loss": 5.860672056674957, "learning_rate": 3.775310254735467e-05, "epoch": 0.24493794905290658, "step": 1500}
{"loss": 5.599938702583313, "learning_rate": 3.367080339647289e-05, "epoch": 0.32658393207054215, "step": 2000}
{"loss": 5.412256263256073, "learning_rate": 2.958850424559112e-05, "epoch": 0.4082299150881777, "step": 2500}
{"loss": 5.261007954120636, "learning_rate": 2.550620509470934e-05, "epoch": 0.48987589810581317, "step": 3000}
{"loss": 5.095327672958374, "learning_rate": 2.1423905943827566e-05, "epoch": 0.5715218811234487, "step": 3500}
{"loss": NaN, "learning_rate": 1.734160679294579e-05, "epoch": 0.6531678641410843, "step": 4000}
{"loss": NaN, "learning_rate": 1.3259307642064011e-05, "epoch": 0.7348138471587198, "step": 4500}
{"loss": NaN, "learning_rate": 9.177008491182235e-06, "epoch": 0.8164598301763554, "step": 5000}
{"loss": NaN, "learning_rate": 5.094709340300458e-06, "epoch": 0.8981058131939909, "step": 5500}
{"loss": NaN, "learning_rate": 1.0124101894186806e-06, "epoch": 0.9797517962116263, "step": 6000}
Wall time: 17min 31s

TrainOutput(global_step=6124, training_loss=nan)
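The loss in this run diverges to NaN after roughly 3,500 steps. One common first adjustment in that situation (a suggestion, not part of the original notebook; the values are illustrative) is a lower learning rate with warmup and explicit gradient clipping:

# Hypothetical adjustment if the MLM loss diverges to NaN: lower the
# learning rate, add warmup, and clip gradients. Values are illustrative.
training_args = TrainingArguments(
    output_dir='./训练中文的bert输出/',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    learning_rate=2e-5,   # below the 5e-5 default used in the run above
    warmup_steps=1000,    # linear warmup before the learning-rate decay
    max_grad_norm=1.0,    # explicit gradient clipping
    save_steps=10_000,
    save_total_limit=2,
)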
# trainer.save_model('./训练中文的bert输出')
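The save call above is commented out, yet the next cell reloads the model with from_pretrained, which needs a config and the trained weights on disk. A minimal sketch of saving both the model and the tokenizer into the training output directory (the next cell instead loads from './', which assumes the files were placed in the working directory):

# Persist the trained model and tokenizer so they can be reloaded later;
# the directory mirrors output_dir from TrainingArguments above.
trainer.save_model('./训练中文的bert输出/')
tokenizer.save_pretrained('./训练中文的bert输出/')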

model = BertForMaskedLM.from_pretrained('./')

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer
)

fill_mask('心[MASK]病')
[{'score': 0.3431047201156616,
  'token': 5552,
  'token_str': '脏',
  'sequence': '心 脏 病'},
 {'score': 0.07011183351278305,
  'token': 2552,
  'token_str': '心',
  'sequence': '心 心 病'},
 {'score': 0.05838495120406151,
  'token': 4567,
  'token_str': '病',
  'sequence': '心 病 病'},
 {'score': 0.014283978380262852,
  'token': 107,
  'token_str': '"',
  'sequence': '心 " 病'},
 {'score': 0.011550793424248695,
  'token': 7315,
  'token_str': '闷',
  'sequence': '心 闷 病'}]
fill_mask('怀[MASK]')
[{'score': 0.4942161738872528,
  'token': 2097,
  'token_str': '孕',
  'sequence': '怀 孕'},
 {'score': 0.050573259592056274,
  'token': 2577,
  'token_str': '怀',
  'sequence': '怀 怀'},
 {'score': 0.01493070088326931,
  'token': 107,
  'token_str': '"',
  'sequence': '怀 "'},
 {'score': 0.010810167528688908,
  'token': 5307,
  'token_str': '经',
  'sequence': '怀 经'},
 {'score': 0.00741073302924633,
  'token': 1453,
  'token_str': '周',
  'sequence': '怀 周'}]