Generating English Text Summaries with the Pretrained BART Model


The same content is also published at: https://blog.csdn.net/yuhengshi/article/details/120970903

Environment

  • python==3.7
  • transformers==4.9.2
  • rouge-score==0.0.4

Data Preparation

Put the data in a single txt file, one example per line. Each line holds the article URL, the article body, and the reference (label) summary, separated by \t.
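For concreteness, a couple of toy lines in the expected layout might look like this (the URLs and texts are made up):

```python
# Toy illustration of the expected file layout: url \t body \t summary per line.
sample_lines = [
    "https://example.com/a1\tThe European Central Bank kept rates unchanged on Thursday, citing ...\tECB holds rates steady.",
    "https://example.com/a2\tHeavy rain caused flooding across northern Italy over the weekend ...\tFloods hit northern Italy after heavy rain.",
]
with open("data_no_daily_news.txt", "w") as f:
    f.write("\n".join(sample_lines) + "\n")
```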

Building the Dataset

```python
from datasets import Dataset


class Data:
    def __init__(self, data_path, tokenizer):
        self.path = data_path
        self.max_input_length = 1024
        self.max_target_length = 150
        # self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_path)
        self.tokenizer = tokenizer

    def preprocess(self, train_scale=0.8):
        with open(self.path, 'r') as f:
            raw_data = f.readlines()
        print(f"=======data_len: {len(raw_data)}")
        start = int(len(raw_data) * train_scale)
        print(f"======train_len: {start}")

        raw_train_data = raw_data[:start]
        raw_test_data = raw_data[start:]
        raw_train_test_data = {'train': {'id': [], 'document': [], 'summary': []},
                               'test': {'id': [], 'document': [], 'summary': []}}
        for i, item in enumerate(raw_train_data):
            # skip malformed lines that do not split into exactly url, text, label
            if len(item.split('\t')) != 3:
                continue
            url, text, label = item.split('\t')
            raw_train_test_data['train']['id'].append(i)
            # document is the model input, summary is the label
            raw_train_test_data['train']['summary'].append(label.strip())
            raw_train_test_data['train']['document'].append(text.strip())

        for j, item in enumerate(raw_test_data):
            if len(item.split('\t')) != 3:
                continue
            url, text, label = item.split('\t')
            # continue the ids where the train split left off
            raw_train_test_data['test']['id'].append(i + j + 1)
            raw_train_test_data['test']['summary'].append(label.strip())
            raw_train_test_data['test']['document'].append(text.strip())

        def preprocess_function(examples):
            # document is the model input
            inputs = examples['document']
            model_inputs = self.tokenizer(inputs, max_length=self.max_input_length,
                                          padding='max_length', truncation=True)
            # summary is the label
            with self.tokenizer.as_target_tokenizer():
                labels = self.tokenizer(examples['summary'], max_length=self.max_target_length,
                                        padding='max_length', truncation=True)
            model_inputs['labels'] = labels['input_ids']
            return model_inputs

        train_dataset = Dataset.from_dict(raw_train_test_data['train'])
        test_dataset = Dataset.from_dict(raw_train_test_data['test'])
        tokenized_train_dataset = train_dataset.map(preprocess_function)
        tokenized_test_dataset = test_dataset.map(preprocess_function)
        return tokenized_train_dataset, tokenized_test_dataset
```
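One caveat worth flagging (not addressed in the preprocessing above): because the summaries are padded to `max_length` with real pad token ids, those pad positions are counted in the training loss. A common pattern is to replace pad ids in the labels with -100, the ignore index that Hugging Face seq2seq models skip, and `compute_metrics` below already maps -100 back to the pad token before decoding. A minimal sketch, assuming the `tokenizer` loaded in the next section:

```python
# Sketch: mask label padding so the loss ignores it (-100 is ignored by
# the model's cross-entropy). This logic would go inside preprocess_function.
labels = tokenizer(["a short reference summary"],
                   max_length=150, padding='max_length', truncation=True)
masked_ids = [[tok if tok != tokenizer.pad_token_id else -100 for tok in seq]
              for seq in labels['input_ids']]
# model_inputs['labels'] = masked_ids
```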


Model Loading

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, \
    Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import BartForConditionalGeneration

checkpoint = "distilbart-xsum-9-6"
model = BartForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
```
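Here `distilbart-xsum-9-6` is a local directory. If you are downloading from the Hugging Face Hub instead, the matching model id should be `sshleifer/distilbart-xsum-9-6` (worth verifying before use):

```python
# Assumption: pulling the same checkpoint from the Hub rather than a local path.
checkpoint = "sshleifer/distilbart-xsum-9-6"
model = BartForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
```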


Metrics

```python
import nltk  # requires a one-time nltk.download('punkt') for sent_tokenize
import numpy as np
from rouge_score import rouge_scorer, scoring


def compute(predictions, references, rouge_types=None, use_aggregator=True, use_stemmer=False):
    if rouge_types is None:
        rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
    if use_aggregator:
        aggregator = scoring.BootstrapAggregator()
    else:
        scores = []

    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        if use_aggregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)

    if use_aggregator:
        result = aggregator.aggregate()
    else:
        result = {}
        for key in scores[0]:
            result[key] = list(score[key] for score in scores)

    return result


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract the mid F1 score for each ROUGE type
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}
```
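As a quick sanity check, the `compute` helper can be run on toy strings; the inputs below are illustrative only:

```python
result = compute(
    predictions=["the cat sat on the mat"],
    references=["the cat lay on the mat"],
    use_stemmer=True,
)
# Each entry is an AggregateScore with low/mid/high bootstrap bounds.
print({k: round(v.mid.fmeasure, 3) for k, v in result.items()})
```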

Training

Hyperparameter Configuration

```python
batch_size = 1
args = Seq2SeqTrainingArguments(
    "/data/yuhengshi/europe_summary/model",
    evaluation_strategy='steps',
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.1,
    save_steps=200,
    save_total_limit=10,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    eval_steps=200,
    logging_dir="/data/yuhengshi/europe_summary/log",
    logging_first_step=True,
)
```

Training with the transformers API

```python
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)
data = Data('/data/yuhengshi/europe_summary/data_no_daily_news.txt', tokenizer)
tokenized_train_dataset, tokenized_test_dataset = data.preprocess()
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
```
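With everything wired together, fine-tuning is launched through the standard `Trainer` API; checkpoints are written to the output directory configured above every 200 steps:

```python
trainer.train()
```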

Results

Training results: pick a checkpoint from the evaluation steps where both the loss and the ROUGE scores look good.
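A saved checkpoint can then be reloaded for inference; the step number in the path below is hypothetical:

```python
# Hypothetical checkpoint directory produced by save_steps=200.
model = BartForConditionalGeneration.from_pretrained(
    "/data/yuhengshi/europe_summary/model/checkpoint-1000"
)
```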


Prediction: Generating Summaries

```python
def predict(sentence):
    inputs = tokenizer([sentence], max_length=1024, return_tensors='pt')
    # note: num_beams=70 is unusually large and slow; 4-10 beams is more typical
    summary_ids = model.generate(inputs['input_ids'], num_beams=70,
                                 max_length=150, min_length=50, early_stopping=True)
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
               for g in summary_ids]
    return ' '.join(summary)
```
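Example usage with a made-up input article:

```python
article = (
    "European markets closed higher on Tuesday as investors weighed fresh "
    "economic data against ongoing supply chain concerns ..."
)  # any English news text
print(predict(article))
```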