---
library_name: peft
base_model: facebook/bart-large
---

# Model Card for LexBartLo_1

## Model Details

### Model Description

LexBartLo_1 is a LoRA adapter, trained with the PEFT library on top of `facebook/bart-large`, for abstractive summarization of patent documents. It is intended to be used in a hybrid pipeline: a LexRank-style extractive step first reorders the input sentences by salience, and the adapted BART model then generates the abstractive summary. See the citation below for details.

## How to use the model
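Before running the example, install the dependencies (`peft`, `transformers`, `torch`, `nltk`, `networkx`, `numpy`) and fetch the NLTK resources used for tokenization and stopword filtering. A minimal one-time setup (the `punkt_tab` download is only needed on newer NLTK releases):

```python
import nltk

# One-time downloads: Punkt sentence tokenizer models and the English stopword list.
nltk.download('punkt')
nltk.download('stopwords')
# Newer NLTK releases look up the Punkt models under a separate identifier:
nltk.download('punkt_tab')
```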

```python
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

def preprocess_text(text):
    # Keep the original sentences for output, and a lowercased, tokenized
    # copy for similarity scoring.
    sentences = sent_tokenize(text)
    tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]
    return sentences, tokenized_sentences

def sentence_similarity(sentence1, sentence2):
    # Cosine similarity over bag-of-words counts, ignoring stopwords.
    stop_words = set(stopwords.words('english'))
    filtered_sentence1 = [w for w in sentence1 if w not in stop_words]
    filtered_sentence2 = [w for w in sentence2 if w not in stop_words]
    all_words = list(set(filtered_sentence1 + filtered_sentence2))
    vector1 = [filtered_sentence1.count(word) for word in all_words]
    vector2 = [filtered_sentence2.count(word) for word in all_words]
    # Guard against all-zero vectors (e.g. sentences made up entirely of
    # stopwords), for which the cosine distance is undefined.
    if not any(vector1) or not any(vector2):
        return 0.0
    return 1 - cosine_distance(vector1, vector2)

def build_similarity_matrix(sentences):
    # Pairwise sentence similarities form the (symmetric) adjacency matrix
    # of the LexRank graph.
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j])
    return similarity_matrix

def apply_lexrank(similarity_matrix, damping=0.85, threshold=0.2, max_iter=100):
    # LexRank: drop edges whose similarity falls below the threshold, then
    # run PageRank over the remaining graph. The threshold is an edge
    # cutoff, not the PageRank convergence tolerance.
    pruned = np.where(similarity_matrix >= threshold, similarity_matrix, 0.0)
    nx_graph = nx.from_numpy_array(pruned)
    scores = nx.pagerank(nx_graph, alpha=damping, max_iter=max_iter)
    return scores

def get_top_sentences(sentences, scores):
    # Order the sentences by their LexRank score, most central first.
    ranked_indices = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)
    return [sentences[i] for i in ranked_indices]

def extract_important_sentences(text):
    # Rank the sentences with LexRank and concatenate them in order of
    # salience, preserving their original casing and punctuation.
    original_sentences, tokenized_sentences = preprocess_text(text)
    similarity_matrix = build_similarity_matrix(tokenized_sentences)
    scores = apply_lexrank(similarity_matrix)
    top_sentences = get_top_sentences(original_sentences, scores)
    return ' '.join(top_sentences)

def summarize(text, max_tokens):
    peft_model = "Nevidu/LexBartLo_1"
    config = PeftConfig.from_pretrained(peft_model)

    # Load the base model and tokenizer, then attach the LoRA adapter.
    model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    model = PeftModel.from_pretrained(model, peft_model)
    model.eval()

    # Reorder the input sentences by salience before handing them to the model.
    sorted_text = extract_important_sentences(text)

    input_ids = tokenizer(sorted_text, return_tensors="pt", truncation=True).input_ids
    with torch.inference_mode():
        outputs = model.generate(input_ids=input_ids, max_new_tokens=max_tokens, do_sample=True, top_p=0.9)
    summary = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return summary
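# Decoding note: do_sample=True with top_p=0.9 yields slightly different
# summaries across runs. For repeatable output, beam search is a common
# alternative (e.g. num_beams=4 with do_sample=False); that swap is a
# suggestion here, not the configuration reported by the model authors.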

text = """ Add your textile patent text"""
max_tokens = 256

summary = summarize(text, max_tokens)
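The extractive stage can also be run on its own to inspect which sentences LexRank ranks as most salient before any abstractive rewriting. A small sketch, assuming the functions defined above are in scope (the sample text is invented for illustration):

```python
sample = (
    "A method for weaving conductive yarn into fabric is disclosed. "
    "The fabric incorporates flexible sensors for temperature monitoring. "
    "Earlier approaches required rigid circuit boards sewn onto the garment."
)
# Prints the sentences reordered by LexRank salience, most central first.
print(extract_important_sentences(sample))
```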

## Citation

```bibtex
@inproceedings{jayatilleke2025hybrid,
  title={A Hybrid Architecture with Efficient Fine Tuning for Abstractive Patent Document Summarization},
  author={Jayatilleke, Nevidu and Weerasinghe, Ruvan},
  booktitle={2025 International Research Conference on Smart Computing and Systems Engineering (SCSE)},
  pages={1--6},
  year={2025},
  organization={IEEE}
}
```

## Framework versions

- PEFT 0.9.0