Codex PR
Files changed:
- Dockerfile: +25 -0
- api/utils.py: +3 -3
- app.py: +184 -301
- requirements.txt: +1 -7
- start.bat: +8 -17
- start.py: +23 -122
- start.sh: +9 -19
Dockerfile
ADDED
@@ -0,0 +1,25 @@
+FROM python:3.10-slim
+
+# System deps for PDF parsing
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Install Python deps first for better cache hits
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application code
+COPY . /app
+
+# Default model can be overridden in Space settings
+ENV DEFAULT_MODEL=t5-small
+ENV PORT=7860
+
+EXPOSE 7860
+
+# Start Streamlit on the expected port/interface
+CMD ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
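A quick local smoke test of the new image (a sketch; the book-summarizer tag is just an arbitrary local name, not something this PR defines):

# Build from the repo root, then run with the port and env the Dockerfile declares
docker build -t book-summarizer .
docker run --rm -p 7860:7860 -e DEFAULT_MODEL=t5-small book-summarizer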
api/utils.py
CHANGED
@@ -1,5 +1,4 @@
 import re
-import nltk
 from typing import List, Dict, Any
 import logging
 
@@ -113,7 +112,8 @@ def get_text_statistics(text: str) -> Dict[str, Any]:
     Get basic statistics about the text.
     """
     words = text.split()
-    … (removed line not captured in this view)
+    # Lightweight sentence split to avoid NLTK downloads
+    sentences = [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+', text) if s.strip()]
 
     return {
         'total_characters': len(text),
@@ -121,4 +121,4 @@ def get_text_statistics(text: str) -> Dict[str, Any]:
         'total_sentences': len(sentences),
         'average_words_per_sentence': len(words) / len(sentences) if sentences else 0,
         'estimated_reading_time_minutes': len(words) / 200  # Average reading speed
-    }
+    }
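A quick way to sanity-check the regex-based splitter without NLTK installed (a sketch; it assumes the repo root is the working directory so api.utils imports cleanly):

# Should report total_sentences == 3 for this input
python -c "from api.utils import get_text_statistics; print(get_text_statistics('First sentence. Second one! A third?'))"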
app.py
CHANGED
@@ -1,314 +1,197 @@
+import os
 import streamlit as st
-… (5 removed lines truncated in this view, mostly import statements)
+from typing import Dict, Any
+
+from api.pdf_processor import PDFProcessor
+from api.summarizer import BookSummarizer
+
+DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "sshleifer/distilbart-cnn-12-6")
+AVAILABLE_MODELS = BookSummarizer(DEFAULT_MODEL).get_available_models()
+
 
-# Page configuration
 st.set_page_config(
-    page_title="Book Summarizer
+    page_title="Book Summarizer",
     page_icon="📚",
     layout="wide",
-    initial_sidebar_state="expanded"
+    initial_sidebar_state="expanded",
 )
 
-# API configuration
-API_BASE_URL = "http://localhost:8000"
 
-… (30 removed lines not captured in this view)
+@st.cache_resource
+def get_pdf_processor() -> PDFProcessor:
+    return PDFProcessor()
+
+
+@st.cache_resource
+def get_summarizer(model_name: str) -> BookSummarizer:
+    summarizer = BookSummarizer(model_name=model_name)
+    summarizer.load_model()
+    return summarizer
+
+
+def summarize_pdf(
+    uploaded_file,
+    model_name: str,
+    max_length: int,
+    min_length: int,
+    chunk_size: int,
+    overlap: int,
+) -> Dict[str, Any]:
+    pdf_bytes = uploaded_file.getvalue()
+    processor = get_pdf_processor()
+
+    validation = processor.validate_pdf(pdf_bytes)
+    if not validation["valid"]:
+        raise ValueError(validation["message"])
+
+    metadata = processor.get_pdf_metadata(pdf_bytes)
+    extraction = processor.extract_text_from_pdf(pdf_bytes)
+    if not extraction["success"]:
+        raise RuntimeError(extraction["message"])
+
+    summarizer = get_summarizer(model_name)
+    summary_result = summarizer.summarize_book(
+        text=extraction["text"],
+        chunk_size=chunk_size,
+        overlap=overlap,
+        max_length=max_length,
+        min_length=min_length,
+    )
+
+    if not summary_result["success"]:
+        raise RuntimeError(summary_result.get("error", "Summarization failed"))
+
+    return {
+        "metadata": metadata,
+        "validation": validation,
+        "extraction": extraction,
+        "summary": summary_result,
     }
-… (6 removed lines not captured in this view)
+
+
+def sidebar_controls():
+    st.header("Settings")
+
+    model_names = [m["name"] for m in AVAILABLE_MODELS]
+    model_descriptions = {m["name"]: m["description"] for m in AVAILABLE_MODELS}
+
+    selected_model = st.selectbox(
+        "Model",
+        model_names,
+        index=model_names.index(DEFAULT_MODEL) if DEFAULT_MODEL in model_names else 0,
+        help="Free, locally run Hugging Face models. First run downloads weights.",
+    )
+    st.caption(model_descriptions.get(selected_model, ""))
+
+    max_length = st.slider(
+        "Maximum summary length (words)",
+        min_value=50,
+        max_value=250,
+        value=140,
+        step=10,
+    )
+    min_length = st.slider(
+        "Minimum summary length (words)",
+        min_value=20,
+        max_value=min(120, max_length - 10),
+        value=min(50, max_length - 20),
+        step=5,
+    )
+
+    chunk_size = st.slider(
+        "Chunk size (characters)",
+        min_value=600,
+        max_value=2000,
+        value=1200,
+        step=50,
+        help="Longer chunks preserve context but take longer.",
+    )
+    overlap = st.slider(
+        "Chunk overlap (characters)",
+        min_value=50,
+        max_value=300,
+        value=120,
+        step=10,
+    )
+
+    return {
+        "model": selected_model,
+        "max_length": max_length,
+        "min_length": min_length,
+        "chunk_size": chunk_size,
+        "overlap": overlap,
     }
-… (8 removed lines not captured in this view)
+
+
+def show_file_info(uploaded_file):
+    size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
+    st.info(f"Selected: **{uploaded_file.name}** ({size_mb:.1f} MB)")
+
+
+def show_results(result: Dict[str, Any]):
+    summary_text = result["summary"]["summary"]
+    stats = result["summary"]["statistics"]
+    original_stats = result["extraction"]["statistics"]
+
+    st.success("Summary ready!")
+
+    col1, col2, col3, col4 = st.columns(4)
+    col1.metric("Pages", result["validation"]["pages"])
+    col2.metric("Original words", f"{original_stats.get('total_words', 0):,}")
+    col3.metric("Summary words", f"{stats.get('final_summary_length', 0):,}")
+    compression = stats.get("overall_compression_ratio", 0)
+    col4.metric("Compression", f"{compression:.1%}" if compression else "N/A")
+
+    st.subheader("Summary")
+    st.text_area("Generated summary", value=summary_text, height=400, label_visibility="collapsed")
+
+    st.download_button(
+        label="Download summary",
+        data=summary_text.encode("utf-8"),
+        file_name=f"{result['metadata'].get('title', 'summary').replace(' ', '_')}.txt",
+        mime="text/plain",
+    )
+
+    st.subheader("Book snapshot")
+    preview = result["extraction"]["text"][:1500]
+    if len(result["extraction"]["text"]) > 1500:
+        preview += " ..."
+    st.text_area("First 1500 characters", value=preview, height=220, label_visibility="collapsed")
+
+
+def main():
+    st.title("📚 AI-Powered Book Summarizer")
+    st.write(
+        "Upload a PDF (under 50MB) to generate a concise summary locally with free, open models. "
+        "No paid API keys required; first run will download model weights."
+    )
+
+    st.divider()
+
     with st.sidebar:
-… (23 removed lines not captured in this view)
-            st.error("Failed to load models")
-            selected_model = "facebook/bart-large-cnn"
-        except Exception as e:
-            st.error(f"Error loading models: {str(e)}")
-            selected_model = "facebook/bart-large-cnn"
-
-        # Summary settings
-        st.subheader("Summary Settings")
-        max_length = st.slider("Maximum Summary Length", 50, 500, 150, help="Maximum number of words in the summary")
-        min_length = st.slider("Minimum Summary Length", 10, 200, 50, help="Minimum number of words in the summary")
-
-        # Advanced settings
-        with st.expander("Advanced Settings"):
-            chunk_size = st.slider("Chunk Size", 500, 2000, 1000, help="Size of text chunks for processing")
-            overlap = st.slider("Chunk Overlap", 50, 200, 100, help="Overlap between text chunks")
-
-        # API status
-        st.subheader("API Status")
-        try:
-            health_response = requests.get(f"{API_BASE_URL}/health")
-            if health_response.status_code == 200:
-                st.success("✅ API Connected")
-            else:
-                st.error("❌ API Error")
-        except:
-            st.error("❌ API Unavailable")
-
-    # Main content
-    tab1, tab2, tab3 = st.tabs(["Summarize Book", "Text Analysis", "ℹ️ About"])
-
-    with tab1:
-        st.header("Book Summarization")
-
-        # File upload
-        uploaded_file = st.file_uploader(
-            "Choose a PDF book file",
-            type=['pdf'],
-            help="Upload a PDF file (max 50MB)"
-        )
-
-        if uploaded_file is not None:
-            # File info
-            file_size = len(uploaded_file.getvalue()) / (1024 * 1024)  # MB
-            st.info(f"**File:** {uploaded_file.name} ({file_size:.1f} MB)")
-
-            # Validate file
-            if st.button("Validate PDF", type="secondary"):
-                with st.spinner("Validating PDF..."):
-                    try:
-                        files = {"file": uploaded_file.getvalue()}
-                        response = requests.post(f"{API_BASE_URL}/upload-pdf", files=files)
-
-                        if response.status_code == 200:
-                            data = response.json()
-                            st.success(f"✅ {data['message']}")
-
-                            # Display metadata
-                            metadata = data.get('metadata', {})
-                            col1, col2, col3 = st.columns(3)
-                            with col1:
-                                st.metric("Pages", data['pages'])
-                            with col2:
-                                st.metric("Size", f"{data['size_mb']:.1f} MB")
-                            with col3:
-                                st.metric("Title", metadata.get('title', 'Unknown'))
-                        else:
-                            st.error(f"❌ Validation failed: {response.json().get('detail', 'Unknown error')}")
-                    except Exception as e:
-                        st.error(f"❌ Error: {str(e)}")
-
-        # Summarize button
-        if st.button("Generate Summary", type="primary"):
-            if uploaded_file is not None:
-                with st.spinner("Processing your book..."):
-                    try:
-                        # Prepare request
-                        files = {"file": uploaded_file.getvalue()}
-                        data = {
-                            "max_length": max_length,
-                            "min_length": min_length,
-                            "chunk_size": chunk_size,
-                            "overlap": overlap,
-                            "model_name": selected_model
-                        }
-
-                        # Send request
-                        response = requests.post(f"{API_BASE_URL}/summarize", files=files, data=data)
-
-                        if response.status_code == 200:
-                            result = response.json()
-
-                            # Display success message
-                            st.success("✅ Summary generated successfully!")
-
-                            # Display statistics
-                            col1, col2, col3, col4 = st.columns(4)
-                            stats = result.get('statistics', {})
-                            orig_stats = result.get('original_statistics', {})
-
-                            with col1:
-                                st.metric("Original Words", f"{orig_stats.get('total_words', 0):,}")
-                            with col2:
-                                st.metric("Summary Words", f"{stats.get('final_summary_length', 0):,}")
-                            with col3:
-                                compression = stats.get('overall_compression_ratio', 0)
-                                st.metric("Compression", f"{compression:.1%}")
-                            with col4:
-                                st.metric("Chunks Processed", stats.get('total_chunks', 0))
-
-                            # Display summary
-                            st.subheader("Generated Summary")
-                            summary = result.get('summary', '')
-                            st.text_area(
-                                "Summary",
-                                value=summary,
-                                height=400,
-                                disabled=True
-                            )
-
-                            # Download button
-                            summary_bytes = summary.encode('utf-8')
-                            st.download_button(
-                                label="Download Summary",
-                                data=summary_bytes,
-                                file_name=f"{uploaded_file.name.replace('.pdf', '')}_summary.txt",
-                                mime="text/plain"
-                            )
-
-                        else:
-                            error_msg = response.json().get('detail', 'Unknown error')
-                            st.error(f"❌ Summarization failed: {error_msg}")
-
-                    except Exception as e:
-                        st.error(f"❌ Error: {str(e)}")
-
-    with tab2:
-        st.header("Text Analysis")
-
-        if uploaded_file is not None:
-            if st.button("Analyze Text"):
-                with st.spinner("Analyzing text..."):
-                    try:
-                        files = {"file": uploaded_file.getvalue()}
-                        response = requests.post(f"{API_BASE_URL}/extract-text", files=files)
-
-                        if response.status_code == 200:
-                            data = response.json()
-                            stats = data.get('statistics', {})
-
-                            # Display statistics
-                            col1, col2, col3, col4 = st.columns(4)
-
-                            with col1:
-                                st.metric("Total Words", f"{stats.get('total_words', 0):,}")
-                            with col2:
-                                st.metric("Total Sentences", f"{stats.get('total_sentences', 0):,}")
-                            with col3:
-                                st.metric("Avg Words/Sentence", f"{stats.get('average_words_per_sentence', 0):.1f}")
-                            with col4:
-                                st.metric("Reading Time", f"{stats.get('estimated_reading_time_minutes', 0):.1f} min")
-
-                            # Text preview
-                            st.subheader("Text Preview")
-                            text_response = requests.post(f"{API_BASE_URL}/extract-text", files=files)
-                            if text_response.status_code == 200:
-                                text_data = text_response.json()
-                                preview_text = text_data.get('text', '')[:1000] + "..." if len(text_data.get('text', '')) > 1000 else text_data.get('text', '')
-                                st.text_area("First 1000 characters:", value=preview_text, height=200, disabled=True)
-                        else:
-                            st.error(f"❌ Analysis failed: {response.json().get('detail', 'Unknown error')}")
-                    except Exception as e:
-                        st.error(f"❌ Error: {str(e)}")
-        else:
-            st.info("Please upload a PDF file to analyze its text.")
-
-    with tab3:
-        st.header("ℹ️ About")
-
-        st.markdown("""
-        ## 🤖 Book Summarizer AI
-
-        This application uses advanced AI models to automatically summarize PDF books.
-        It processes the text in chunks and generates comprehensive summaries while
-        maintaining the key information and context.
-
-        ### ✨ Features
-
-        - **PDF Text Extraction**: Advanced PDF processing with fallback methods
-        - **AI Summarization**: State-of-the-art transformer models
-        - **Configurable Settings**: Adjust summary length and processing parameters
-        - **Multiple Models**: Choose from different AI models for various use cases
-        - **Text Analysis**: Detailed statistics about the book content
-
-        ### 🛠️ Technology Stack
-
-        - **Frontend**: Streamlit
-        - **Backend**: FastAPI
-        - **AI Models**: Hugging Face Transformers (BART, T5)
-        - **PDF Processing**: PyPDF2, pdfplumber
-        - **Text Processing**: NLTK
-
-        ### How It Works
-
-        1. **Upload**: Select a PDF book file (max 50MB)
-        2. **Extract**: The system extracts and cleans text from the PDF
-        3. **Chunk**: Large texts are split into manageable chunks
-        4. **Summarize**: AI models process each chunk and generate summaries
-        5. **Combine**: Individual summaries are combined into a final summary
-        6. **Download**: Get your summary in text format
-
-        ### Getting Started
-
-        1. Make sure the API server is running (`uvicorn api.main:app --reload`)
-        2. Upload a PDF book file
-        3. Configure your preferred settings
-        4. Click "Generate Summary" and wait for processing
-        5. Download your AI-generated summary
-
-        ### Support
-
-        For issues or questions, please check the API documentation at `/docs`
-        when the server is running.
-        """)
+        controls = sidebar_controls()
+
+    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+
+    if uploaded_file:
+        show_file_info(uploaded_file)
+        if st.button("Generate summary", type="primary"):
+            with st.spinner("Extracting text and generating summary..."):
+                try:
+                    result = summarize_pdf(
+                        uploaded_file=uploaded_file,
+                        model_name=controls["model"],
+                        max_length=controls["max_length"],
+                        min_length=controls["min_length"],
+                        chunk_size=controls["chunk_size"],
+                        overlap=controls["overlap"],
+                    )
+                    show_results(result)
+                except Exception as exc:
+                    st.error(f"Could not summarize this PDF: {exc}")
+    else:
+        st.info("Upload a small/medium PDF to get started. Scans or image-only PDFs will not work well.")
+

 if __name__ == "__main__":
-    main()
+    main()
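To run the reworked app outside Docker (a sketch; t5-small is just the Dockerfile default, and any model reported by get_available_models() should work):

pip install -r requirements.txt
# app.py reads DEFAULT_MODEL via os.getenv, so the model can be switched per run
DEFAULT_MODEL=t5-small streamlit run app.py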
requirements.txt
CHANGED
@@ -1,12 +1,6 @@
 streamlit==1.28.1
-fastapi==0.104.1
-uvicorn==0.24.0
-python-multipart==0.0.6
 PyPDF2==3.0.1
 pdfplumber==0.10.3
 transformers==4.35.2
 torch>=2.2.0
-… (removed line not captured in this view)
-requests==2.31.0
-python-dotenv==1.0.0
-pydantic==2.5.0
+sentencepiece>=0.1.99
start.bat
CHANGED
@@ -1,32 +1,23 @@
 @echo off
-echo …
-echo …
+echo Book Summarizer - Windows Startup
+echo ================================
 
-echo.
-echo Checking Python installation...
 python --version >nul 2>&1
 if errorlevel 1 (
-    echo …
-    echo Please install Python from https://python.org
+    echo Python is not installed or not in PATH.
     pause
    exit /b 1
 )
 
-echo …
-
-echo.
-echo Installing dependencies...
+echo Installing dependencies (if needed)...
 pip install -r requirements.txt
 if errorlevel 1 (
-    echo …
+    echo Failed to install dependencies.
    pause
    exit /b 1
 )
 
-echo …
-
-echo.
-echo Starting Book Summarizer AI...
-python start.py
+echo Launching Streamlit...
+python -m streamlit run app.py --server.port 8501 --server.address 0.0.0.0
 
-pause
+pause
start.py
CHANGED
@@ -1,135 +1,36 @@
 #!/usr/bin/env python3
 """
-… (removed line not captured in this view)
-This script helps you start both the FastAPI backend and Streamlit frontend.
+Minimal launcher for the Streamlit book summarizer.
 """
 
 import subprocess
 import sys
-import time
-import requests
-import os
-from pathlib import Path
 
-def check_dependencies():
-    """Check if required packages are installed."""
-    required_packages = [
-        'streamlit', 'fastapi', 'uvicorn', 'transformers',
-        'torch', 'PyPDF2', 'pdfplumber', 'nltk'
-    ]
-
-    missing_packages = []
-    for package in required_packages:
-        try:
-            __import__(package)
-        except ImportError:
-            missing_packages.append(package)
-
-    if missing_packages:
-        print("❌ Missing required packages:")
-        for package in missing_packages:
-            print(f" - {package}")
-        print("\nInstall them with: pip install -r requirements.txt")
-        return False
-
-    print("✅ All dependencies are installed")
-    return True
 
-… (3 removed lines truncated in this view: header of an NLTK download helper)
-        import nltk
-        nltk.download('punkt', quiet=True)
-        nltk.download('stopwords', quiet=True)
-        print("✅ NLTK data downloaded")
-    except Exception as e:
-        print(f"⚠️ Warning: Could not download NLTK data: {e}")
-
-def check_api_health():
-    """Check if the API is running and healthy."""
-    try:
-        response = requests.get("http://localhost:8000/health", timeout=5)
-        return response.status_code == 200
-    except:
-        return False
-
-def start_api():
-    """Start the FastAPI backend."""
-    print("Starting FastAPI backend...")
-
-    # Check if API is already running
-    if check_api_health():
-        print("✅ API is already running")
-        return True
-
-    try:
-        # Start the API server
-        api_process = subprocess.Popen([
-            sys.executable, "-m", "uvicorn",
-            "api.main:app",
-            "--reload",
-            "--port", "8000",
-            "--host", "0.0.0.0"
-        ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-        # Wait for API to start
-        print("⏳ Waiting for API to start...")
-        for i in range(30):  # Wait up to 30 seconds
-            time.sleep(1)
-            if check_api_health():
-                print("✅ API started successfully")
-                return True
-
-        print("❌ API failed to start within 30 seconds")
-        return False
-
-    except Exception as e:
-        print(f"❌ Error starting API: {e}")
-        return False
-
-def start_frontend():
-    """Start the Streamlit frontend."""
-    print("Starting Streamlit frontend...")
-
+def main():
+    print("Starting Streamlit app...")
+    print("If dependencies are missing, install with: pip install -r requirements.txt\n")
     try:
-        … (6 removed lines not captured in this view)
+        subprocess.run(
+            [
+                sys.executable,
+                "-m",
+                "streamlit",
+                "run",
+                "app.py",
+                "--server.port",
+                "8501",
+                "--server.address",
+                "0.0.0.0",
+            ],
+            check=True,
+        )
+    except subprocess.CalledProcessError as exc:
+        print(f"Streamlit exited with an error: {exc}")
+        sys.exit(exc.returncode)
     except KeyboardInterrupt:
-        … (removed line truncated in this view)
-    except Exception as e:
-        print(f"❌ Error starting frontend: {e}")
+        print("\nStopping Streamlit...")
 
-def main():
-    """Main startup function."""
-    print("Book Summarizer AI - Startup")
-    print("=" * 40)
-
-    # Check dependencies
-    if not check_dependencies():
-        sys.exit(1)
-
-    # Download NLTK data
-    download_nltk_data()
-
-    print("\nStarting services...")
-
-    # Start API
-    if not start_api():
-        print("❌ Failed to start API. Please check the logs.")
-        sys.exit(1)
-
-    print("\nReady! Opening the application...")
-    print("Frontend: http://localhost:8501")
-    print("API: http://localhost:8000")
-    print("API Docs: http://localhost:8000/docs")
-    print("\nPress Ctrl+C to stop the application")
-
-    # Start frontend
-    start_frontend()
 
 if __name__ == "__main__":
-    main()
+    main()
start.sh
CHANGED
@@ -1,28 +1,18 @@
 #!/bin/bash
 
-echo "…
-echo "…
+echo "Book Summarizer - Startup"
+echo "========================="
 
-echo ""
-echo "Checking Python installation..."
 if ! command -v python3 &> /dev/null; then
-    echo "…
-    echo "Please install Python 3 from https://python.org"
+    echo "Python 3 is not installed or not in PATH."
     exit 1
 fi
 
-echo "…
-
-echo ""
-echo "Installing dependencies..."
-pip3 install -r requirements.txt
-if [ $? -ne 0 ]; then
-    echo "❌ Failed to install dependencies"
+echo "Installing dependencies (if needed)..."
+pip3 install -r requirements.txt || {
+    echo "Failed to install dependencies."
     exit 1
-… (2 removed lines not captured in this view)
-echo "✅ Dependencies installed"
+}
 
-echo ""
-
-python3 start.py
+echo "Launching Streamlit..."
+python3 -m streamlit run app.py --server.port 8501 --server.address 0.0.0.0