import nltk
import streamlit as st
from nltk.tokenize import sent_tokenize
from transformers import pipeline

# Fetch the sentence tokenizer data used by sent_tokenize
nltk.download('punkt')
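# Newer NLTK releases resolve sent_tokenize through the 'punkt_tab'
# resource instead; fetching it as well (an addition, not in the original
# script) avoids a LookupError on those versions.
nltk.download('punkt_tab')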
st.title('Hugging Face Transformers Summarizer')

# Models offered in the dropdown selector
models = ["sshleifer/distilbart-cnn-12-6", "facebook/bart-large-cnn", "t5-base", "t5-large", "google/pegasus-newsroom"]
model = st.sidebar.selectbox("Choose a model", models)

uploaded_file = st.file_uploader("Choose a .txt file", type="txt")

# Keywords used to filter the uploaded text before summarization
keywords = st.text_input("Enter keywords (comma-separated)")

# Sidebar slider for the target summary length, as a percentage of the input
scale_percentage = st.sidebar.slider('Scale %', min_value=1, max_value=100, value=50)

# Sidebar slider for the chunk size fed to the model
chunk_size = st.sidebar.slider('Chunk size (words)', min_value=100, max_value=1000, value=500)
if uploaded_file is not None and keywords:
    user_input = uploaded_file.read().decode('utf-8')
    keywords = [keyword.strip() for keyword in keywords.split(",")]

    # Split the text into sentences
    sentences = sent_tokenize(user_input)

    # Keep only sentences that mention at least one keyword
    filtered_sentences = [
        sentence for sentence in sentences
        if any(keyword.lower() in sentence.lower() for keyword in keywords)
    ]
    filtered_text = ' '.join(filtered_sentences)
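
    # Hypothetical guard (not in the original script): warn when no sentence
    # matches the keywords, since the summarizer would then see empty input.
    if not filtered_text:
        st.warning('No sentences matched the given keywords.')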
    if st.button('Summarize'):
        summarizer = pipeline('summarization', model=model)
        summarized_text = ""

        # Split the filtered text into fixed-size word chunks
        words = filtered_text.split()
        chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

        # Target a band of +/-10 percentage points around the chosen scale.
        # Note: the pipeline measures min_length/max_length in tokens, so
        # the word counts below are only a rough proxy.
        min_length_percentage = max(scale_percentage - 10, 1)
        max_length_percentage = min(scale_percentage + 10, 100)

        # Summarize each chunk and concatenate the partial summaries
        for chunk in chunks:
            chunk_length = len(chunk.split())
            min_length = max(int(chunk_length * min_length_percentage / 100), 1)
            # Keep max_length strictly above min_length so very short
            # chunks cannot produce an invalid (max < min) pair
            max_length = max(int(chunk_length * max_length_percentage / 100), min_length + 1)
            summarized = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
            summarized_text += summarized[0]['summary_text'] + " "

        st.text_area('Summarized Text', summarized_text, height=200)
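
# A possible refinement, sketched here rather than wired in: cache the
# pipeline with st.cache_resource so Streamlit reruns reuse the loaded
# model instead of rebuilding it, and pass truncation=True so chunks
# longer than the model's input window are truncated instead of raising.
#
# @st.cache_resource
# def load_summarizer(model_name):
#     return pipeline('summarization', model=model_name)
#
# summarizer = load_summarizer(model)
# summarized = summarizer(chunk, max_length=max_length, min_length=min_length,
#                         do_sample=False, truncation=True)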