dohyune committed on
Commit
41800ca
·
verified ·
1 Parent(s): 4571aae

Update app.py

Files changed (1)
  1. app.py +833 -238
app.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
  PLOBIN
3
  """
 
4
  import streamlit as st
5
  import streamlit.components.v1 as components
6
  import fitz # PyMuPDF
@@ -17,6 +18,7 @@ import base64
17
  from dotenv import load_dotenv
18
  import json
19
  from difflib import SequenceMatcher
 
20
 
21
  def get_svg_content(svg_path):
22
  with open(svg_path, "r", encoding="utf-8") as f:
@@ -28,6 +30,8 @@ load_dotenv()
28
 
29
  GROK_API_KEY = os.getenv("GROK_API_KEY")
30
  GROK_API_BASE = "https://api.x.ai/v1"
 
 
31
  CHROMA_DIR = "./chroma_db"
32
  EMBEDDING_MODEL = 'jhgan/ko-sroberta-multitask'
33
 
@@ -46,11 +50,13 @@ st.markdown("""
46
  <style>
47
  [data-testid="stSidebar"] {
48
  background: linear-gradient(180deg,
49
- #618FC2 0%,
50
- #8E969E 100%);
51
- box-shadow: 4px 0 30px rgba(0,0,0,0.2);
52
- width: 290px !important;
 
53
  }
 
54
 
55
  [data-testid="stSidebar"] h1 {
56
  color: white !important;
@@ -62,7 +68,7 @@ st.markdown("""
62
  animation: sidebarTitlePulse 4s ease-in-out infinite;
63
  letter-spacing: 2px;
64
  }
65
-
66
  @keyframes sidebarTitlePulse {
67
  0%, 100% {
68
  transform: scale(1);
@@ -82,13 +88,15 @@ st.markdown("""
82
  }
83
 
84
  [data-testid="stSidebar"] [data-testid="stFileUploader"] {
85
- background: rgba(255,255,255,0.15);
86
  border-radius: 15px;
87
  padding: 1.5rem;
88
- border: 3px dashed rgba(255,255,255,0.4);
89
  transition: all 0.3s ease;
90
  backdrop-filter: blur(10px);
91
  }
 
 
92
 
93
  [data-testid="stFileUploader"] > section {
94
  background: transparent !important;
@@ -99,9 +107,15 @@ st.markdown("""
99
  }
100
 
101
  [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] {
102
- color: #fafafa;
103
  }
104
 
 
 
 
 
 
 
105
  [data-testid="stSidebar"] [data-testid="stFileUploader"] > section,
106
  [data-testid="stSidebar"] [data-testid="stFileUploader"] section > div {
107
  background: transparent !important;
@@ -109,19 +123,19 @@ st.markdown("""
109
  }
110
 
111
  [data-testid="stSidebar"] [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] p {
112
- color: rgba(255,255,255,0.9) !important;
113
  }
114
 
115
  [data-testid="stSidebar"] [data-testid="stFileUploader"] button[kind="secondary"] {
116
- background: rgba(255,255,255,0.2) !important;
117
- color: white !important;
118
- border: 1px solid rgba(255,255,255,0.3) !important;
119
  }
120
 
121
  [data-testid="stSidebar"] .stButton button {
122
- background: rgba(255,255,255,0.15) !important;
123
- color: white !important;
124
- border: 2px solid rgba(255,255,255,0.4) !important;
125
  border-radius: 12px !important;
126
  font-weight: 700 !important;
127
  padding: 0.75rem 1.5rem !important;
@@ -131,10 +145,9 @@ st.markdown("""
131
  }
132
 
133
  [data-testid="stSidebar"] .stButton button:hover {
134
- background: rgba(255,255,255,0.25) !important;
135
- border-color: rgba(255,255,255,0.6) !important;
136
  transform: translateY(-2px) scale(1.02) !important;
137
- box-shadow: 0 6px 20px rgba(0,0,0,0.2) !important;
138
  }
139
 
140
  [data-testid="stSidebar"] .stButton button:active {
@@ -153,13 +166,48 @@ st.markdown("""
153
  }
154
 
155
  [data-testid="stSidebar"] [data-testid="stAlert"] {
156
- background-color: rgba(255, 255, 255, 0.001) !important;
157
  border-radius: 0.5rem !important;
158
  }
 
 
 
 
 
 
159
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  [data-testid="stAlert"] p {
161
- color: rgb(250, 250, 250);
 
 
 
 
 
 
 
 
 
 
 
162
  }
 
 
 
 
 
 
163
 
164
  .main .block-container {
165
  max-width: 100%;
@@ -370,9 +418,76 @@ st.markdown("""
370
  box-shadow: 0 4px 8px rgba(234, 179, 8, 0.3) !important;
371
  background: linear-gradient(135deg, #FDE047 0%, #FACC15 100%) !important;
372
  }
373
  </style>
374
  """, unsafe_allow_html=True)
375
 
376
 
377
  def init_session():
378
  if 'processed' not in st.session_state:
@@ -401,7 +516,125 @@ def init_session():
401
  st.session_state.scroll_to_page = None
402
 
403
 
404
  def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]:
405
  pdf_bytes = pdf_file.read()
406
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
407
 
@@ -412,54 +645,189 @@ def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]
412
  CHUNK_SIZE = 800
413
  OVERLAP_SIZE = 150
414
 
415
- for page_num in range(len(doc)):
416
- page = doc[page_num]
417
- text = page.get_text("text")
418
- pages_text[page_num + 1] = text
419
-
420
- if not text.strip():
421
- continue
422
-
423
- lines = [line.strip() for line in text.split('\n') if line.strip()]
424
- cleaned_text = '\n'.join(lines)
425
-
426
- sentences = re.split(r'([.!?]\s+|\n{2,})', cleaned_text)
427
- sentences = [s for s in sentences if s.strip()]
428
-
429
- current_chunk = ""
430
- current_length = 0
431
-
432
- for sentence in sentences:
433
- sentence_length = len(sentence)
434
 
435
- if current_length + sentence_length > CHUNK_SIZE and current_chunk:
436
- chunks.append(current_chunk.strip())
437
- metadata_list.append({
438
- "page": page_num + 1,
439
- "source": pdf_file.name,
440
- "chunk_type": "paragraph"
441
- })
442
 
443
- overlap_text = current_chunk[-OVERLAP_SIZE:] if len(current_chunk) > OVERLAP_SIZE else current_chunk
444
- current_chunk = overlap_text + sentence
445
- current_length = len(current_chunk)
446
  else:
447
- current_chunk += sentence
448
- current_length += sentence_length
449
-
450
- if current_chunk.strip():
451
- chunks.append(current_chunk.strip())
452
- metadata_list.append({
453
- "page": page_num + 1,
454
- "source": pdf_file.name,
455
- "chunk_type": "paragraph"
456
- })
457
 
458
  doc.close()
459
  return chunks, metadata_list, pdf_bytes, pages_text
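For reference, the sentence-overlap chunking deleted above reduces to the following minimal sketch (`chunk_text` is a name introduced here for illustration; the constants mirror `extract_text_from_pdf`):

```python
import re
from typing import List

CHUNK_SIZE = 800    # max characters per chunk
OVERLAP_SIZE = 150  # tail of the previous chunk carried into the next

def chunk_text(text: str) -> List[str]:
    # Split on sentence enders or blank lines, keeping the delimiters.
    sentences = [s for s in re.split(r'([.!?]\s+|\n{2,})', text) if s.strip()]
    chunks, current = [], ""
    for sentence in sentences:
        if len(current) + len(sentence) > CHUNK_SIZE and current:
            chunks.append(current.strip())
            # Start the next chunk with the previous chunk's tail as overlap.
            current = current[-OVERLAP_SIZE:] + sentence
        else:
            current += sentence
    if current.strip():
        chunks.append(current.strip())
    return chunks
```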
460
 
461
 
462
- @st.cache_resource
463
  def load_embedding_model():
464
  return SentenceTransformer(EMBEDDING_MODEL)
465
 
@@ -555,12 +923,21 @@ def hybrid_search(query: str, collection, embedder, top_k: int = 3) -> Dict:
555
  vector_score = 1 - vector_results['distances'][0][i]
556
 
557
  keyword_score = 0
 
 
558
  doc_lower = doc.lower()
 
 
559
  for keyword in keywords:
560
- if keyword.lower() in doc_lower:
561
  keyword_score += 1
562
- keyword_score = keyword_score / len(keywords) if keywords else 0
563
 
 
564
  hybrid_score = 0.7 * vector_score + 0.3 * keyword_score
565
 
566
  hybrid_results.append({
@@ -642,7 +1019,7 @@ def grok_verify_and_extract(query: str, search_results: Dict, api_key: str) -> D
642
  f"{GROK_API_BASE}/chat/completions",
643
  headers=headers,
644
  json=payload,
645
- timeout=30
646
  )
647
 
648
  if response.status_code != 200:
@@ -759,7 +1136,7 @@ def generate_answer(query: str, search_results: Dict, api_key: str) -> str:
759
  f"{GROK_API_BASE}/chat/completions",
760
  headers=headers,
761
  json=payload,
762
- timeout=30
763
  )
764
 
765
  if response.status_code != 200:
@@ -781,139 +1158,248 @@ def generate_answer(query: str, search_results: Dict, api_key: str) -> str:
781
 
782
  def highlight_text_in_pdf(pdf_bytes: bytes, highlight_info: List[Dict]) -> bytes:
783
  """
784
- Split into sentences, find each one, and highlight every match (more aggressive)
785
  """
786
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
787
  yellow_color = [1.0, 1.0, 0.0]
788
 
789
  def normalize_text(text):
790
- return re.sub(r'\s+', ' ', text.strip().lower())
 
791
 
792
- def find_text_fuzzy(page, search_text, threshold=0.65):
793
- """ํผ์ง€ ๋งค์นญ์œผ๋กœ ํ…์ŠคํŠธ ์˜์—ญ ์ฐพ๊ธฐ (์ž„๊ณ„๊ฐ’ ๋‚ฎ์ถค)"""
794
- search_norm = normalize_text(search_text)
 
795
 
796
- # 1. Try exact-match variants first
797
- variations = [
798
- search_text,
799
- search_text.replace(' ', ''),
800
- search_text.replace('\n', ' '),
801
- search_text.replace(',', ''),
802
- ]
803
 
804
- for var in variations:
805
- instances = page.search_for(var)
806
- if instances:
807
- return instances
808
 
809
- # 2. Block-level fuzzy matching
810
- blocks = page.get_text("blocks")
811
  for block in blocks:
812
- if len(block) < 5:
813
  continue
814
-
815
- block_text = block[4]
816
- block_norm = normalize_text(block_text)
817
-
818
- similarity = SequenceMatcher(None, search_norm, block_norm).ratio()
819
- if similarity >= threshold:
820
- return [fitz.Rect(block[0], block[1], block[2], block[3])]
821
 
822
- # 3. Word-level matching
823
- words = page.get_text("words")
824
- if not words:
825
  return []
826
 
827
- search_words = search_norm.split()
828
- min_words = max(2, len(search_words) // 3) # OK if only a third of the words match
829
-
830
- best_match = None
831
- best_sim = 0.0
832
 
833
- for i in range(len(words)):
834
- for size in range(len(search_words), min_words - 1, -1):
835
- if i + size > len(words):
836
- continue
837
 
838
- window = words[i:i + size]
839
- window_text = " ".join([w[4] for w in window])
840
- window_norm = normalize_text(window_text)
841
 
842
- sim = SequenceMatcher(None, search_norm, window_norm).ratio()
843
- if sim > best_sim and sim >= threshold:
844
- best_sim = sim
845
- rect = fitz.Rect(window[0][:4])
846
- for w in window[1:]:
847
- rect = rect | fitz.Rect(w[:4])
848
- best_match = rect
849
-
850
- if best_match:
851
- return [best_match]
852
 
853
  return []
854
 
855
- for item in highlight_info:
856
  page_num = item['page'] - 1
857
- full_text = item['text'].strip()
858
 
859
  if page_num >= len(doc):
 
 
860
  continue
861
 
862
  page = doc[page_num]
863
 
864
- # Strategy 1: split on sentence-ending periods (ignore commas)
865
- sentences = re.split(r'([.。]\s*)', full_text)
866
- sentences = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 1]
 
867
 
868
- # Reattach the period to the preceding sentence
869
- combined = []
870
- i = 0
871
- while i < len(sentences):
872
- if i + 1 < len(sentences) and sentences[i+1] in ['.', '。']:
873
- combined.append(sentences[i] + sentences[i+1])
874
- i += 2
875
- else:
876
- combined.append(sentences[i])
877
- i += 1
878
 
879
- # Find each sentence individually (8+ characters)
880
- found_any = False
881
- for sentence in combined:
882
- if len(sentence) < 8: # lowered from 10 to 8 characters
883
- continue
884
-
885
- rects = find_text_fuzzy(page, sentence, threshold=0.60) # 0.70 → 0.60
886
- if rects:
887
- found_any = True
888
- for rect in rects:
889
- highlight = page.add_highlight_annot(rect)
890
- highlight.set_colors(stroke=yellow_color)
891
- highlight.update()
 
892
 
893
- # Strategy 2: if sentence-level search fails, retry the full text at a lower threshold
894
- if not found_any:
895
- rects = find_text_fuzzy(page, full_text, threshold=0.50) # 0.60 → 0.50
896
- for rect in rects:
897
  highlight = page.add_highlight_annot(rect)
898
  highlight.set_colors(stroke=yellow_color)
899
  highlight.update()
900
 
901
- # Strategy 3: failing that, highlight at least the key phrases
902
- if not found_any:
903
- # Extract noun-like runs of 10+ Hangul characters
904
- keywords = re.findall(r'[가-힣]{10,}', full_text)
905
- for kw in keywords[:3]: # top 3 only
906
- rects = find_text_fuzzy(page, kw, threshold=0.70)
907
- for rect in rects:
908
- highlight = page.add_highlight_annot(rect)
909
- highlight.set_colors(stroke=yellow_color)
910
- highlight.update()
911
 
912
  output_bytes = doc.tobytes()
913
  doc.close()
914
  return output_bytes
915
 
916
-
917
  def extract_highlights_from_grok(grok_result: Dict) -> List[Dict]:
  return highlights
932
938
  """
939
- import re
- pattern = r'"([^"]+)"\s*\[ํŽ˜์ด์ง€\s+(\d+)\]'
946
- highlights.append({
950
- 'text': quote.strip(),
951
- 'page': int(quote_page)
952
- })
953
 
954
  return highlights
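The deleted extractor above hinges on a single regex; a quick demo on a made-up answer shows what it captures:

```python
import re

# Hypothetical answer text in the format the prompt asks the model to emit.
answer = '근거는 "제안업체는 보안 계획을 제안하여야 함" [페이지 9] 에 있습니다.'
pattern = r'"([^"]+)"\s*\[페이지\s+(\d+)\]'
print(re.findall(pattern, answer))
# [('제안업체는 보안 계획을 제안하여야 함', '9')]
```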
955
 
956
 
 
957
  def render_pdf_with_highlights(pdf_bytes: bytes, highlight_info: List[Dict], zoom_level: float = 2.0):
958
  highlighted_pdf = highlight_text_in_pdf(pdf_bytes, highlight_info)
959
  doc = fitz.open(stream=highlighted_pdf, filetype="pdf")
@@ -974,7 +1580,7 @@ def render_pdf_with_highlights(pdf_bytes: bytes, highlight_info: List[Dict], zoo
974
  if (page_num + 1) in highlighted_pages:
975
pdf_html += f'<div style="background: #FEF08A; color: #854D0E; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold; border-left: 4px solid #EAB308;">⭐ 페이지 {page_num + 1}</div>'
976
  else:
977
- pdf_html += f'<div style="background: #667eea; color: white; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold;">📄 페이지 {page_num + 1}</div>'
978
 
979
  pdf_html += f'<img src="data:image/png;base64,{img_base64}" style="width: {zoom_percentage}%; border: 1px solid #E2E8F0; border-radius: 0.3rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); display: block; margin: 0 auto;" />'
980
  pdf_html += '</div>'
@@ -990,11 +1596,12 @@ def main():
990
  if not st.session_state.processed:
991
  col1, col2, col3 = st.columns([1, 1, 1])
992
  with col2:
993
- st.image("img/plobin.svg", use_container_width=True)
 
994
  st.text(' ')
995
 
996
  with st.sidebar:
997
- st.image("img/plobin.svg", width=120)
998
 
999
  uploaded_file = st.file_uploader(
1000
  "๋“œ๋ž˜๊ทธํ•˜์—ฌ ํŒŒ์ผ์„ ์—…๋กœ๋“œ ๋˜๋Š” ํด๋ฆญํ•˜์—ฌ ์„ ํƒํ•˜์„ธ์š”.",
@@ -1005,8 +1612,8 @@ def main():
1005
 
1006
  if uploaded_file:
1007
if st.button("문서 처리 시작", type="primary", use_container_width=True):
1008
- if not GROK_API_KEY:
1009
- st.error("⚠️ GROK_API_KEY가 .env 파일에 설정되지 않았습니다!")
1010
  st.stop()
1011
 
1012
  st.session_state.vector_db = None
@@ -1014,11 +1621,11 @@ def main():
1014
  st.session_state.chat_history = []
1015
  st.session_state.current_highlights = []
1016
 
1017
- with st.spinner("문서 처리 중..."):
1018
  try:
1019
  chunks, metadata_list, pdf_bytes, pages_text = extract_text_from_pdf(uploaded_file)
1020
 
1021
- with st.spinner("문서를 AI가 이해할 수 있게 처리 중.."):
1022
  collection, embedder = create_vector_db(chunks, metadata_list)
1023
 
1024
  st.session_state.vector_db = collection
@@ -1032,7 +1639,14 @@ def main():
1032
  "pages": len(set(m['page'] for m in metadata_list))
1033
  }
1034
 
1035
- st.success("문서 처리 완료!")
1036
  st.rerun()
1037
 
1038
  except Exception as e:
@@ -1043,36 +1657,36 @@ def main():
1043
  st.info(f"**{st.session_state.doc_metadata['filename']}**")
1044
  st.info(f"ํŽ˜์ด์ง€: {st.session_state.doc_metadata['pages']}")
1045
 
1046
- if not st.session_state.processed:
1047
- st.markdown("""
1048
- <div class="usage-guide">
1049
- <h2 style="text-align: center; color: #2D3748; margin-bottom: 1.5rem;">사용 방법</h2>
1050
- <div class="guide-step">
1051
- <div class="step-number">1</div>
1052
- <div>PDF 파일을 올려주세요</div>
1053
- </div>
1054
- <div class="guide-step">
1055
- <div class="step-number">2</div>
1056
- <div>문서 처리가 완료될 때까지 잠시만 기다려주세요</div>
1057
- </div>
1058
- <div class="guide-step">
1059
- <div class="step-number">3</div>
1060
- <div>문서 내 궁금한 내용을 물어보세요</div>
1061
- </div>
1062
- <div class="guide-step">
1063
- <div class="step-number">4</div>
1064
- <div>AI가 정확한 답변과 출처를 함께 알려드려요</div>
1065
- </div>
1066
- </div>
1067
- """, unsafe_allow_html=True)
1068
-
1069
- else:
1070
  col1, col2 = st.columns([1, 1])
1071
 
1072
  with col1:
1073
  header_cols = st.columns([7, 1, 1.5, 1])
1074
  with header_cols[0]:
1075
- st.markdown("### 문서 뷰어")
1076
 
1077
  if st.session_state.pdf_bytes:
1078
  pdf_html = render_pdf_with_highlights(
@@ -1105,7 +1719,7 @@ def main():
1105
  st.session_state.scroll_to_page = None
1106
 
1107
  with col2:
1108
- st.markdown('### PLOBIN CHAT', unsafe_allow_html=True)
1109
 
1110
  chat_container = st.container(height=650)
1111
 
@@ -1113,41 +1727,20 @@ def main():
1113
  for msg_idx, msg in enumerate(st.session_state.chat_history):
1114
  with st.chat_message(msg["role"]):
1115
  st.markdown(msg["content"])
1116
-
1117
- if msg["role"] == "assistant" and "sources" in msg:
1118
- with st.expander("📚 참조 문서"):
1119
- for idx, (doc, meta) in enumerate(zip(msg["sources"]["docs"], msg["sources"]["metas"])):
1120
- clean_text = doc[:150] + ('...' if len(doc) > 150 else '')
1121
-
1122
- if st.button(
1123
- f"ํŽ˜์ด์ง€ {meta['page']}",
1124
- key=f"goto_source_msg{msg_idx}_{meta['page']}_{idx}",
1125
- use_container_width=True,
1126
- type="secondary"
1127
- ):
1128
- st.session_state.scroll_to_page = meta['page']
1129
- st.rerun()
1130
-
1131
- st.markdown(f"""
1132
- <div style="background: #F1F5F9; padding: 0.8rem; border-radius: 0.5rem; margin-bottom: 1rem; border-left: 3px solid #667eea;">
1133
- <div style="font-size: 0.9rem; color: #475569;">
1134
- {clean_text}
1135
- </div>
1136
- </div>
1137
- """, unsafe_allow_html=True)
1138
 
1139
- prompt = st.chat_input("💬 질문을 입력하세요...", key="chat_input")
1140
 
1141
  if prompt:
1142
  st.session_state.chat_history.append({"role": "user", "content": prompt})
1143
  st.session_state.processing_query = prompt
1144
  st.rerun()
1145
 
 
1146
  if st.session_state.processing_query:
1147
  query = st.session_state.processing_query
1148
  st.session_state.processing_query = None
1149
 
1150
- with st.spinner("PLOBIN이 검색중입니다..."):
1151
  try:
1152
  search_results = hybrid_search(
1153
  query,
@@ -1168,7 +1761,16 @@ def main():
1168
  GROK_API_KEY
1169
  )
1170
 
1171
  highlights = extract_highlights_from_answer(answer)
1172
  st.session_state.current_highlights = highlights
1173
 
1174
  if grok_result and "page" in grok_result and "error" not in grok_result:
@@ -1176,14 +1778,7 @@ def main():
1176
 
1177
  chat_data = {
1178
  "role": "assistant",
1179
- "content": answer,
1180
- "sources": {
1181
- "docs": search_results['documents'][0],
1182
- "metas": search_results['metadatas'][0],
1183
- "scores": search_results.get('scores', []),
1184
- "keywords": search_results.get('keywords', []),
1185
- "grok_verified": grok_result
1186
- }
1187
  }
1188
  st.session_state.chat_history.append(chat_data)
1189
  st.rerun()
@@ -1198,4 +1793,4 @@ def main():
1198
 
1199
 
1200
  if __name__ == "__main__":
1201
- main()
 
1
  """
2
  PLOBIN
3
  """
4
+ import difflib
5
  import streamlit as st
6
  import streamlit.components.v1 as components
7
  import fitz # PyMuPDF
 
18
  from dotenv import load_dotenv
19
  import json
20
  from difflib import SequenceMatcher
21
+ import pdfplumber
22
 
23
  def get_svg_content(svg_path):
24
  with open(svg_path, "r", encoding="utf-8") as f:
 
30
 
31
  GROK_API_KEY = os.getenv("GROK_API_KEY")
32
  GROK_API_BASE = "https://api.x.ai/v1"
33
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
34
+ OPENAI_API_BASE = "https://api.openai.com/v1"
35
  CHROMA_DIR = "./chroma_db"
36
  EMBEDDING_MODEL = 'jhgan/ko-sroberta-multitask'
37
 
 
50
  <style>
51
  [data-testid="stSidebar"] {
52
  background: linear-gradient(180deg,
53
+ #f9f9f9 0%,
54
+ #f9f9f9 100%);
55
+ box-shadow: none;
56
+ border-right: 1px solid #ededed;
57
+ width: 280px !important;
58
  }
59
+
60
 
61
  [data-testid="stSidebar"] h1 {
62
  color: white !important;
 
68
  animation: sidebarTitlePulse 4s ease-in-out infinite;
69
  letter-spacing: 2px;
70
  }
71
+
72
  @keyframes sidebarTitlePulse {
73
  0%, 100% {
74
  transform: scale(1);
 
88
  }
89
 
90
  [data-testid="stSidebar"] [data-testid="stFileUploader"] {
91
+ background: rgba(198,198,198,0.15);
92
  border-radius: 15px;
93
  padding: 1.5rem;
94
+ border: 1.5px dashed rgba(198,198,198,0.4);
95
  transition: all 0.3s ease;
96
  backdrop-filter: blur(10px);
97
  }
98
+
99
+
100
 
101
  [data-testid="stFileUploader"] > section {
102
  background: transparent !important;
 
107
  }
108
 
109
  [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] {
110
+ color: #c6c6c6;
111
  }
112
 
113
+ /* Change the user message icon */
114
+ [data-testid="stChatMessage"][data-testid="user"]
115
+ [data-testid="chat-message-avatar"] img {
116
+ content: url("https://your-image-url.com/user-icon.png") !important;
117
+ }
118
+
119
  [data-testid="stSidebar"] [data-testid="stFileUploader"] > section,
120
  [data-testid="stSidebar"] [data-testid="stFileUploader"] section > div {
121
  background: transparent !important;
 
123
  }
124
 
125
  [data-testid="stSidebar"] [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] p {
126
+ color: #555555 !important;
127
  }
128
 
129
  [data-testid="stSidebar"] [data-testid="stFileUploader"] button[kind="secondary"] {
130
+ background: rgba(127,128,134,0.2) !important;
131
+ color: #8A8A8A !important;
132
+ border: 1px solid rgba(127,128,134,0.3) !important;
133
  }
134
 
135
  [data-testid="stSidebar"] .stButton button {
136
+ background: rgba(127,128,134,0.15) !important;
137
+ color: #555555 !important;
138
+ border: 2px solid rgba(127,128,134,0.4) !important;
139
  border-radius: 12px !important;
140
  font-weight: 700 !important;
141
  padding: 0.75rem 1.5rem !important;
 
145
  }
146
 
147
  [data-testid="stSidebar"] .stButton button:hover {
148
+ background: rgba(255, 36, 36,0.25) !important;
149
+ border-color: rgba(255, 36, 36,0.6) !important;
150
  transform: translateY(-2px) scale(1.02) !important;
 
151
  }
152
 
153
  [data-testid="stSidebar"] .stButton button:active {
 
166
  }
167
 
168
  [data-testid="stSidebar"] [data-testid="stAlert"] {
169
+ background-color: #f2f2f2 !important;
170
  border-radius: 0.5rem !important;
171
  }
172
+
173
+
174
+
175
+ [data-testid="stSidebar"] [data-testid="stFileUploader"] button {
176
+ display: block;
177
+ }
178
 
179
+ /* Keep the sidebar collapse/expand button always visible */
180
+ [data-testid="stSidebarCollapseButton"] {
181
+ opacity: 1 !important;
182
+ visibility: visible !important;
183
+ transition: opacity 0.2s ease !important;
184
+ }
185
+
186
+ /* No hover fade: keep it fully opaque */
187
+ [data-testid="stSidebarCollapseButton"]:hover {
188
+ opacity: 1 !important;
189
+ }
190
+
191
  [data-testid="stAlert"] p {
192
+ color: #747474;
193
+ }
194
+
195
+ /* Force-style the whole sidebar alert box */
196
+ [data-testid="stSidebar"] [data-testid="stAlert"] {
197
+ background-color: #f2f2f2 !important; /* desired background color */
198
+ border-radius: 0.5rem !important;
199
+ }
200
+
201
+ /* Force the color onto the alert's inner container as well */
202
+ [data-testid="stSidebar"] [data-testid="stAlert"] > div {
203
+ background-color: #f2f2f2 !important;
204
  }
205
+
206
+ /* Innermost alert message box */
207
+ [data-testid="stSidebar"] [data-testid="stAlert"] [role="alert"] {
208
+ background-color: #f2f2f2 !important;
209
+ }
210
+
211
 
212
  .main .block-container {
213
  max-width: 100%;
 
418
  box-shadow: 0 4px 8px rgba(234, 179, 8, 0.3) !important;
419
  background: linear-gradient(135deg, #FDE047 0%, #FACC15 100%) !important;
420
  }
421
+
422
+ /* Change the chat input border color on focus */
423
+ [data-testid="stChatInput"] textarea:focus {
424
+ border-color: #3f3f3f !important;
425
+ box-shadow: 0 0 0 1px #3f3f3f !important;
426
+ }
427
+
428
+ /* ์ฑ„ํŒ… ์ž…๋ ฅ์ฐฝ ๊ธฐ๋ณธ ์ƒํƒœ */
429
+ [data-testid="stChatInput"] textarea {
430
+ border-color: #3f3f3f !important;
431
+ transition: border-color 0.2s ease;
432
+ }
433
+
434
+ /* Hover state */
435
+ [data-testid="stChatInput"] textarea:hover {
436
+ border-color: #3f3f3f !important;
437
+ }
438
+
439
+ /* Hide Streamlit's default avatar */
440
+ [data-testid="stChatMessage"][data-testid="user"]
441
+ [data-testid="chat-message-avatar"] img {
442
+ display: none !important;
443
+ }
444
+
445
+ /* Replace with a custom icon */
446
+ [data-testid="stChatMessage"][data-testid="user"]
447
+ [data-testid="chat-message-avatar"] {
448
+ background-image: url("final/img/user-profile.png");
449
+ background-size: cover;
450
+ background-position: center;
451
+ width: 36px !important;
452
+ height: 36px !important;
453
+ border-radius: 50%; /* circular */
454
+ }
455
+
456
+ /* Remove the default avatar */
457
+ [data-testid="stChatMessage"][data-testid="assistant"]
458
+ [data-testid="chat-message-avatar"] img {
459
+ display: none !important;
460
+ }
461
+
462
+ /* Set a custom icon */
463
+ [data-testid="stChatMessage"][data-testid="assistant"]
464
+ [data-testid="chat-message-avatar"] {
465
+ background-image: url("final/img/cloud.png");
466
+ background-size: cover;
467
+ background-position: center;
468
+ width: 36px !important;
469
+ height: 36px !important;
470
+ border-radius: 50%;
471
+ }
472
+
473
+
474
+
475
  </style>
476
  """, unsafe_allow_html=True)
477
 
478
+ SPACE_RE = re.compile(r'\s+')
479
+
480
+ def normalize_for_search(text: str) -> str:
481
+ """
482
+ Normalize text for search/matching:
483
+ - strip leading/trailing whitespace
484
+ - lowercase
485
+ - remove all whitespace (so spacing differences are ignored)
486
+ """
487
+ text = text.strip().lower()
488
+ text = SPACE_RE.sub('', text) # drop every whitespace character
489
+ return text
490
+
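A quick check of the helper on spacing variants (redefined inline so the snippet runs on its own; the strings are made up):

```python
import re

SPACE_RE = re.compile(r'\s+')

def normalize_for_search(text: str) -> str:
    # Strip, lowercase, then drop every whitespace character.
    return SPACE_RE.sub('', text.strip().lower())

print(normalize_for_search("계약 체결일"))  # '계약체결일'
print(normalize_for_search("계약체결일"))   # '계약체결일' – same key either way
```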
491
 
492
  def init_session():
493
  if 'processed' not in st.session_state:
 
516
  st.session_state.scroll_to_page = None
517
 
518
 
519
+ def extract_table_image_as_base64(pdf_bytes: bytes, page_num: int, bbox: tuple) -> str:
520
+ """
521
+ Extract a table region from a PDF page as an image and encode it as base64
522
+
523
+ Args:
524
+ pdf_bytes: raw PDF bytes
525
+ page_num: page number (0-based)
526
+ bbox: (x0, y0, x1, y1) table region coordinates
527
+
528
+ Returns:
529
+ base64-encoded image string
530
+ """
531
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
532
+ page = doc[page_num]
533
+
534
+ # Render the bbox region to an image (high resolution)
535
+ rect = fitz.Rect(bbox)
536
+ pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0), clip=rect)
537
+ img_bytes = pix.tobytes("png")
538
+
539
+ doc.close()
540
+
541
+ # Encode as base64
542
+ img_base64 = base64.b64encode(img_bytes).decode('utf-8')
543
+ return img_base64
544
+
545
+
546
+ def convert_table_to_markdown_with_vision(
547
+ pdf_bytes: bytes,
548
+ page_num: int,
549
+ bbox: tuple,
550
+ api_key: str
551
+ ) -> str:
552
+ """
553
+ Convert a table image to Markdown using the OpenAI Vision API
554
+
555
+ Args:
556
+ pdf_bytes: raw PDF bytes
557
+ page_num: page number
558
+ bbox: table region coordinates
559
+ api_key: OpenAI API key
560
+
561
+ Returns:
562
+ the table in Markdown format
563
+ """
564
+ # Extract the table region as an image
565
+ img_base64 = extract_table_image_as_base64(pdf_bytes, page_num, bbox)
566
+
567
+ # Call the OpenAI Vision API
568
+ prompt = """์ด ์ด๋ฏธ์ง€๋Š” PDF ๋ฌธ์„œ์˜ ํ‘œ์ž…๋‹ˆ๋‹ค.
569
+ ํ‘œ์˜ ๋‚ด์šฉ์„ ์ •ํ™•ํ•˜๊ฒŒ ๋งˆํฌ๋‹ค์šด ํ‘œ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ํ•ด์ฃผ์„ธ์š”.
570
+
571
+ ๊ทœ์น™:
572
+ 1. ์…€ ๋ณ‘ํ•ฉ์ด ์žˆ์œผ๋ฉด ์ ์ ˆํžˆ ์ฒ˜๋ฆฌ
573
+ 2. ์ค‘์ฒฉ๋œ ํ‘œ๊ฐ€ ์žˆ์œผ๋ฉด ํ…์ŠคํŠธ๋กœ ํ‘œํ˜„
574
+ 3. ๋นˆ ์…€์€ ๋นˆ ์นธ์œผ๋กœ ์œ ์ง€
575
+ 4. ํ‘œ ํ˜•์‹๋งŒ ๋ฐ˜ํ™˜ (์ถ”๊ฐ€ ์„ค๋ช… ์—†์ด)
576
+
577
+ ๋งˆํฌ๋‹ค์šด ํ‘œ ํ˜•์‹:
578
+ | ์—ด1 | ์—ด2 | ์—ด3 |
579
+ | --- | --- | --- |
580
+ | ๋ฐ์ดํ„ฐ1 | ๋ฐ์ดํ„ฐ2 | ๋ฐ์ดํ„ฐ3 |"""
581
+
582
+ try:
583
+ response = requests.post(
584
+ f"{OPENAI_API_BASE}/chat/completions",
585
+ headers={
586
+ "Authorization": f"Bearer {api_key}",
587
+ "Content-Type": "application/json"
588
+ },
589
+ json={
590
+ "model": "gpt-4o", # gpt-4o ๋˜๋Š” gpt-4o-mini
591
+ "messages": [
592
+ {
593
+ "role": "user",
594
+ "content": [
595
+ {
596
+ "type": "text",
597
+ "text": prompt
598
+ },
599
+ {
600
+ "type": "image_url",
601
+ "image_url": {
602
+ "url": f"data:image/png;base64,{img_base64}",
603
+ "detail": "high" # "low", "high", "auto"
604
+ }
605
+ }
606
+ ]
607
+ }
608
+ ],
609
+ "temperature": 0.1,
610
+ "max_tokens": 2000
611
+ },
612
+ timeout=120
613
+ )
614
+
615
+ if response.status_code == 200:
616
+ result = response.json()
617
+ markdown_table = result['choices'][0]['message']['content']
618
+
619
+ # Strip code fences
620
+ markdown_table = re.sub(r'```markdown\s*|\s*```', '', markdown_table)
621
+ markdown_table = re.sub(r'```\s*|\s*```', '', markdown_table)
622
+
623
+ return markdown_table.strip()
624
+ else:
625
+ # Print error details
626
+ error_detail = response.text
627
+ print(f"OpenAI API 오류: {response.status_code}")
628
+ print(f"상세: {error_detail}")
629
+ return f"[표 변환 실패: {response.status_code} - {error_detail[:200]}]"
630
+
631
+ except Exception as e:
632
+ return f"[ํ‘œ ๋ณ€ํ™˜ ์‹คํŒจ: {str(e)}]"
633
+
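A hedged sketch of how the two helpers compose: detect table bboxes with pdfplumber, then hand each one to the Vision conversion. The file path and page index are illustrative, and it assumes the functions above are in scope:

```python
import pdfplumber

with open("sample.pdf", "rb") as fh:  # hypothetical input file
    pdf_bytes = fh.read()

with pdfplumber.open("sample.pdf") as pdf:
    page = pdf.pages[0]
    tables = page.find_tables({"vertical_strategy": "lines",
                               "horizontal_strategy": "lines"})
    for table in tables:
        md = convert_table_to_markdown_with_vision(
            pdf_bytes, 0, table.bbox, OPENAI_API_KEY)
        print(md)
```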
634
  def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]:
635
+ """
636
+ Extract text and tables from the PDF (tables are handled by the OpenAI Vision API)
637
+ """
638
  pdf_bytes = pdf_file.read()
639
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
640
 
 
645
  CHUNK_SIZE = 800
646
  OVERLAP_SIZE = 150
647
 
648
+ # Open the PDF with pdfplumber
649
+ pdf_file.seek(0)
650
+
651
+ with pdfplumber.open(pdf_file) as pdf_plumber:
652
+ for page_num in range(len(doc)):
653
+ # Extract text with PyMuPDF
654
+ fitz_page = doc[page_num]
655
+ text = fitz_page.get_text("text")
656
 
657
+ # Detect tables with pdfplumber
658
+ tables_markdown = []
659
+ if page_num < len(pdf_plumber.pages):
660
+ plumber_page = pdf_plumber.pages[page_num]
661
 
662
+ # Table detection settings
663
+ table_settings = {
664
+ "vertical_strategy": "lines",
665
+ "horizontal_strategy": "lines",
666
+ "snap_tolerance": 3,
667
+ "join_tolerance": 3,
668
+ }
669
+
670
+ tables = plumber_page.find_tables(table_settings=table_settings)
671
+
672
+ # Process each table with the Vision API
673
+ for idx, table in enumerate(tables):
674
+ bbox = table.bbox # (x0, y0, x1, y1)
675
+
676
+ # Convert to Markdown via the OpenAI Vision API
677
+ markdown_table = convert_table_to_markdown_with_vision(
678
+ pdf_bytes,
679
+ page_num,
680
+ bbox,
681
+ OPENAI_API_KEY
682
+ )
683
+
684
+ tables_markdown.append(f"\n\n**[표 {idx + 1}]**\n{markdown_table}\n")
685
+
686
+ # Combine text and tables
687
+ combined_content = text
688
+ if tables_markdown:
689
+ combined_content += "\n\n" + "\n".join(tables_markdown)
690
+
691
+ pages_text[page_num + 1] = combined_content
692
+
693
+ if not combined_content.strip():
694
+ continue
695
+
696
+ # Split into chunks
697
+ lines = [line.strip() for line in combined_content.split('\n') if line.strip()]
698
+ cleaned_text = '\n'.join(lines)
699
+
700
+ # If table markers are present, split on them first
701
+ if "**[표" in cleaned_text:
702
+ # Split by table
703
+ table_pattern = r'\*\*\[표 \d+\]\*\*'
704
+ parts = re.split(f'({table_pattern})', cleaned_text)
705
+
706
+ current_chunk = ""
707
+ for part in parts:
708
+ part = part.strip()
709
+ if not part:
710
+ continue
711
+
712
+ # A table marker section
713
+ if re.match(table_pattern, part):
714
+ if current_chunk:
715
+ chunks.append(current_chunk.strip())
716
+ metadata_list.append({
717
+ "page": page_num + 1,
718
+ "source": pdf_file.name,
719
+ "chunk_type": "text"
720
+ })
721
+ current_chunk = ""
722
+ current_chunk = part
723
+ else:
724
+ # Table body or plain text
725
+ if current_chunk and re.match(table_pattern, current_chunk):
726
+ # The previous part was a table marker, so append the table body
727
+ current_chunk += "\n" + part
728
+ chunks.append(current_chunk.strip())
729
+ metadata_list.append({
730
+ "page": page_num + 1,
731
+ "source": pdf_file.name,
732
+ "chunk_type": "table"
733
+ })
734
+ current_chunk = ""
735
+ else:
736
+ # Plain text handling
737
+ if len(current_chunk) + len(part) > CHUNK_SIZE:
738
+ if current_chunk:
739
+ chunks.append(current_chunk.strip())
740
+ metadata_list.append({
741
+ "page": page_num + 1,
742
+ "source": pdf_file.name,
743
+ "chunk_type": "text"
744
+ })
745
+ current_chunk = part
746
+ else:
747
+ current_chunk += "\n" + part if current_chunk else part
748
+
749
+ if current_chunk:
750
+ chunk_type = "table" if re.match(table_pattern, current_chunk) else "text"
751
+ chunks.append(current_chunk.strip())
752
+ metadata_list.append({
753
+ "page": page_num + 1,
754
+ "source": pdf_file.name,
755
+ "chunk_type": chunk_type
756
+ })
757
  else:
758
+ # No tables: fall back to plain-text chunking
759
+ sentences = re.split(r'([.!?]\s+|\n{2,})', cleaned_text)
760
+ sentences = [s for s in sentences if s.strip()]
761
+
762
+ current_chunk = ""
763
+ current_length = 0
764
+
765
+ for sentence in sentences:
766
+ sentence_length = len(sentence)
767
+
768
+ if current_length + sentence_length > CHUNK_SIZE and current_chunk:
769
+ chunks.append(current_chunk.strip())
770
+ metadata_list.append({
771
+ "page": page_num + 1,
772
+ "source": pdf_file.name,
773
+ "chunk_type": "text"
774
+ })
775
+
776
+ overlap_text = current_chunk[-OVERLAP_SIZE:] if len(current_chunk) > OVERLAP_SIZE else current_chunk
777
+ current_chunk = overlap_text + sentence
778
+ current_length = len(current_chunk)
779
+ else:
780
+ current_chunk += sentence
781
+ current_length += sentence_length
782
+
783
+ if current_chunk.strip():
784
+ chunks.append(current_chunk.strip())
785
+ metadata_list.append({
786
+ "page": page_num + 1,
787
+ "source": pdf_file.name,
788
+ "chunk_type": "text"
789
+ })
790
 
791
  doc.close()
792
  return chunks, metadata_list, pdf_bytes, pages_text
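A tiny demo of the table-marker split used above, on made-up page text; the capturing group keeps the `**[표 N]**` markers in the output list:

```python
import re

cleaned_text = "본문 내용입니다.\n**[표 1]**\n| 항목 | 값 |\n| --- | --- |"
table_pattern = r'\*\*\[표 \d+\]\*\*'
print(re.split(f'({table_pattern})', cleaned_text))
# ['본문 내용입니다.\n', '**[표 1]**', '\n| 항목 | 값 |\n| --- | --- |']
```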
793
 
794
 
795
+ def save_extracted_text_to_file(chunks: List[str], metadata_list: List[Dict], filename: str):
796
+ """
797
+ ์ถ”์ถœํ•œ ํ…์ŠคํŠธ๋ฅผ ๋กœ์ปฌ ํŒŒ์ผ๋กœ ์ €์žฅ
798
+ """
799
+ import os
800
+ from datetime import datetime
801
+
802
+ # Create the output directory
803
+ output_dir = "extracted_text"
804
+ os.makedirs(output_dir, exist_ok=True)
805
+
806
+ # Build the output filename (with timestamp)
807
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
808
+ base_name = os.path.splitext(filename)[0]
809
+ output_file = os.path.join(output_dir, f"{base_name}_{timestamp}.txt")
810
+
811
+ # Write the text
812
+ with open(output_file, 'w', encoding='utf-8') as f:
813
+ f.write(f"=" * 80 + "\n")
814
+ f.write(f"๋ฌธ์„œ๋ช…: {filename}\n")
815
+ f.write(f"์ถ”์ถœ ์‹œ๊ฐ„: {timestamp}\n")
816
+ f.write(f"์ด ์ฒญํฌ ์ˆ˜: {len(chunks)}\n")
817
+ f.write(f"=" * 80 + "\n\n")
818
+
819
+ for idx, (chunk, meta) in enumerate(zip(chunks, metadata_list), 1):
820
+ f.write(f"\n{'='*80}\n")
821
+ f.write(f"์ฒญํฌ #{idx}\n")
822
+ f.write(f"ํŽ˜์ด์ง€: {meta.get('page', 'N/A')}\n")
823
+ f.write(f"ํƒ€์ž…: {meta.get('chunk_type', 'text')}\n")
824
+ f.write(f"{'-'*80}\n")
825
+ f.write(chunk)
826
+ f.write(f"\n{'='*80}\n")
827
+
828
+ return output_file
829
+
830
+ @st.cache_resource(show_spinner=False)
831
  def load_embedding_model():
832
  return SentenceTransformer(EMBEDDING_MODEL)
833
 
 
923
  vector_score = 1 - vector_results['distances'][0][i]
924
 
925
  keyword_score = 0
926
+
927
+ # Prepare both the raw and the normalized form
928
  doc_lower = doc.lower()
929
+ doc_norm = normalize_for_search(doc) # whitespace-stripped version
930
+
931
  for keyword in keywords:
932
+ kw_lower = keyword.lower()
933
+ kw_norm = normalize_for_search(keyword)
934
+
935
+ # 1) Original check: direct substring containment
936
+ # 2) Whitespace-stripped check: matches even when words are joined or oddly spaced
937
+ if kw_lower in doc_lower or kw_norm in doc_norm:
938
  keyword_score += 1
 
939
 
940
+ keyword_score = keyword_score / len(keywords) if keywords else 0
941
  hybrid_score = 0.7 * vector_score + 0.3 * keyword_score
942
 
943
  hybrid_results.append({
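On toy values, the 0.7/0.3 blend and the whitespace-stripped keyword check behave like this (the document line and the vector score below are invented):

```python
import re

def normalize_for_search(text: str) -> str:
    return re.sub(r'\s+', '', text.strip().lower())

doc = "낙찰자는 계약 체결일로부터 10일 이내에 착수하여야 한다"
keywords = ["계약체결일", "착수"]

vector_score = 0.82  # hypothetical: 1 - cosine distance from Chroma
hits = sum(1 for kw in keywords
           if kw.lower() in doc.lower()
           or normalize_for_search(kw) in normalize_for_search(doc))
keyword_score = hits / len(keywords) if keywords else 0
print(0.7 * vector_score + 0.3 * keyword_score)  # 0.874 – both keywords hit
```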
 
1019
  f"{GROK_API_BASE}/chat/completions",
1020
  headers=headers,
1021
  json=payload,
1022
+ timeout=120
1023
  )
1024
 
1025
  if response.status_code != 200:
 
1136
  f"{GROK_API_BASE}/chat/completions",
1137
  headers=headers,
1138
  json=payload,
1139
+ timeout=120
1140
  )
1141
 
1142
  if response.status_code != 200:
 
1158
 
1159
  def highlight_text_in_pdf(pdf_bytes: bytes, highlight_info: List[Dict]) -> bytes:
1160
  """
1161
+ PyMuPDF-based highlighting: try the full text first, split only on failure
1162
  """
1163
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
1164
  yellow_color = [1.0, 1.0, 0.0]
1165
 
1166
  def normalize_text(text):
1167
+ """ํ…์ŠคํŠธ ์ •๊ทœํ™”"""
1168
+ return re.sub(r'\s+', ' ', text.strip())
1169
 
1170
+ def merge_rects(rects, threshold=5):
1171
+ """๊ฒน์น˜๊ฑฐ๋‚˜ ์ธ์ ‘ํ•œ ์‚ฌ๊ฐํ˜•๋“ค์„ ๋ณ‘ํ•ฉ"""
1172
+ if not rects:
1173
+ return []
1174
 
1175
+ # Sort rectangles by y, then x
1176
+ sorted_rects = sorted(rects, key=lambda r: (r.y0, r.x0))
1177
+ merged = [sorted_rects[0]]
1178
 
1179
+ for rect in sorted_rects[1:]:
1180
+ last = merged[-1]
1181
+ # Same line and overlapping or adjacent on x: merge
1182
+ if abs(rect.y0 - last.y0) < threshold:
1183
+ if rect.x0 <= last.x1 + threshold:
1184
+ merged[-1] = fitz.Rect(
1185
+ min(last.x0, rect.x0),
1186
+ min(last.y0, rect.y0),
1187
+ max(last.x1, rect.x1),
1188
+ max(last.y1, rect.y1)
1189
+ )
1190
+ else:
1191
+ merged.append(rect)
1192
+ # Different line but vertically contiguous (a line break)
1193
+ elif rect.y0 <= last.y1 + 20:
1194
+ merged.append(rect)
1195
+ else:
1196
+ merged.append(rect)
1197
+
1198
+ return merged
1199
+
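A toy check of `merge_rects`, assuming it is lifted to module scope: two word boxes on the same baseline with a small gap collapse into one highlight rectangle:

```python
import fitz  # PyMuPDF

a = fitz.Rect(50, 100, 120, 112)
b = fitz.Rect(123, 100, 200, 112)  # 3pt gap, same baseline
print(merge_rects([a, b]))         # one merged Rect(50.0, 100.0, 200.0, 112.0)
```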
1200
+ def find_text_across_lines(page, search_text):
1201
+ """์ค„๋ฐ”๊ฟˆ์„ ๋„˜์–ด์„œ ํ…์ŠคํŠธ ์ฐพ๊ธฐ - ๊ณต๋ฐฑ ๋ฌด์‹œ ๋น„๊ต"""
1202
+ found_rects = []
1203
+
1204
+ # Get the page's text structure
1205
+ blocks = page.get_text("dict")["blocks"]
1206
+
1207
+ # Collect every line's text and bbox
1208
+ lines_info = [] # [(text, bbox), ...]
1209
 
 
 
1210
  for block in blocks:
1211
+ if "lines" not in block:
1212
  continue
1213
+ for line in block["lines"]:
1214
+ line_text = ""
1215
+ for span in line["spans"]:
1216
+ line_text += span["text"]
1217
+ if line_text.strip():
1218
+ lines_info.append((line_text, fitz.Rect(line["bbox"])))
 
1219
 
1220
+ if not lines_info:
 
 
1221
  return []
1222
 
1223
+ # Normalize the query: fully whitespace-stripped
1224
+ search_no_space = search_text.lower().replace(" ", "").replace("\n", "")
1225
 
1226
+ # Search over concatenations of consecutive lines
1227
+ for start_idx in range(len(lines_info)):
1228
+ combined_text = ""
1229
+ combined_bboxes = []
1230
+
1231
+ for end_idx in range(start_idx, min(start_idx + 5, len(lines_info))): # at most 5 lines
1232
+ line_text, line_bbox = lines_info[end_idx]
1233
+ combined_text += line_text
1234
+ combined_bboxes.append(line_bbox)
1235
 
1236
+ # Compare after stripping whitespace (the key step!)
1237
+ combined_no_space = combined_text.lower().replace(" ", "").replace("\n", "")
 
1238
 
1239
+ # Check whether the query is contained
1240
+ if search_no_space in combined_no_space:
1241
+ # Matched: return the bboxes of those lines
1242
+ for bbox in combined_bboxes:
1243
+ found_rects.append(bbox)
1244
+ print(f" โœ… ๋ผ์ธ ๋งค์นญ ({start_idx+1}~{end_idx+1}์ค„): {len(combined_bboxes)}๊ฐœ ์˜์—ญ")
1245
+ return merge_rects(found_rects)
1246
 
1247
  return []
1248
 
1249
+ def find_text_with_pymupdf(page, search_text):
1250
+ """PyMuPDF๋กœ ํ…์ŠคํŠธ ์ฐพ๊ธฐ - ์ •ํ™•ํ•˜๊ณ  ๊น”๋”ํ•˜๊ฒŒ"""
1251
+ found_rects = []
1252
+ search_text = search_text.strip()
1253
+
1254
+ print(f" ๊ฒ€์ƒ‰ ์ค‘...")
1255
+
1256
+ # === Priority 1: PyMuPDF built-in search ===
1257
+ instances = page.search_for(search_text)
1258
+ if instances:
1259
+ print(f" โœ… ์„ฑ๊ณต [์›๋ณธ]: {len(instances)}๊ฐœ")
1260
+ return merge_rects(instances)
1261
+
1262
+ # === Priority 2: search after normalization ===
1263
+ normalized = normalize_text(search_text)
1264
+ if normalized != search_text:
1265
+ instances = page.search_for(normalized)
1266
+ if instances:
1267
+ print(f" โœ… ์„ฑ๊ณต [์ •๊ทœํ™”]: {len(instances)}๊ฐœ")
1268
+ return merge_rects(instances)
1269
+
1270
+ # === Priority 3: search across line breaks (line matching) ===
1271
+ line_results = find_text_across_lines(page, search_text)
1272
+ if line_results:
1273
+ return line_results
1274
+
1275
+ print(f" โš ๏ธ ๋ผ์ธ ๋งค์นญ ์‹คํŒจ โ†’ ํ•ต์‹ฌ ๊ตฌ๋ฌธ")
1276
+
1277
+ # === Priority 4: search key phrases only (first 30 chars + last 20 chars) ===
1278
+ if len(search_text) > 50:
1279
+ # Leading part
1280
+ front = search_text[:30]
1281
+ front_inst = page.search_for(front)
1282
+ if front_inst:
1283
+ print(f" โœ… ์•ž๋ถ€๋ถ„ ๋งค์นญ: {front[:20]}...")
1284
+ found_rects.extend(front_inst[:1]) # ์ฒซ ๋ฒˆ์งธ๋งŒ
1285
+
1286
+ # Trailing part
1287
+ back = search_text[-20:]
1288
+ back_inst = page.search_for(back)
1289
+ if back_inst:
1290
+ print(f" โœ… ๋’ท๋ถ€๋ถ„ ๋งค์นญ: ...{back[:15]}")
1291
+ found_rects.extend(back_inst[:1]) # ์ฒซ ๋ฒˆ์งธ๋งŒ
1292
+
1293
+ if found_rects:
1294
+ return merge_rects(found_rects)
1295
+
1296
+ print(f" โš ๏ธ ํ•ต์‹ฌ ๊ตฌ๋ฌธ ์‹คํŒจ โ†’ ํ‚ค์›Œ๋“œ")
1297
+
1298
+ # === Priority 5: keywords (at most 2) ===
1299
+ keywords = re.findall(r'[가-힣]{10,}', search_text)
1300
+ if not keywords:
1301
+ keywords = re.findall(r'[가-힣]{7,}', search_text)
1302
+
1303
+ if keywords:
1304
+ for kw in keywords[:2]: # at most 2
1305
+ inst = page.search_for(kw)
1306
+ if inst:
1307
+ print(f" โœ… ํ‚ค์›Œ๋“œ: {kw}")
1308
+ found_rects.extend(inst[:1]) # ์ฒซ ๋ฒˆ์งธ๋งŒ
1309
+
1310
+ if found_rects:
1311
+ return merge_rects(found_rects)
1312
+
1313
+ # === Priority 6: blocks ===
1314
+ print(f" 최후: 블록")
1315
+ blocks = page.get_text("dict")["blocks"]
1316
+ search_norm = normalize_text(search_text.lower())
1317
+
1318
+ for block in blocks:
1319
+ if "lines" not in block:
1320
+ continue
1321
+
1322
+ block_text = ""
1323
+ for line in block["lines"]:
1324
+ for span in line["spans"]:
1325
+ block_text += span["text"] + " "
1326
+
1327
+ block_norm = normalize_text(block_text.lower())
1328
+
1329
+ if search_norm in block_norm:
1330
+ found_rects.append(fitz.Rect(block["bbox"]))
1331
+ print(f" โœ… ๋ธ”๋ก ์ผ์น˜")
1332
+ break
1333
+
1334
+ return merge_rects(found_rects) if found_rects else []
1335
+
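Hypothetical usage of the cascade above (it assumes `find_text_with_pymupdf` is reachable and a sample file exists):

```python
import fitz  # PyMuPDF

doc = fitz.open("sample.pdf")  # illustrative path
page = doc[0]
for rect in find_text_with_pymupdf(page, "계약 체결일로부터 10일 이내"):
    page.add_highlight_annot(rect)
doc.save("highlighted.pdf")
```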
1336
+ print(f"\n{'='*80}")
1337
+ print(f"ํ•˜์ด๋ผ์ดํŠธ ์‹œ์ž‘ - ์ด {len(highlight_info)}๊ฐœ ํ•ญ๋ชฉ")
1338
+ print(f"{'='*80}")
1339
+
1340
+ total_success = 0
1341
+ total_failed = 0
1342
+
1343
+ for idx, item in enumerate(highlight_info, 1):
1344
  page_num = item['page'] - 1
1345
+ text_to_highlight = item['text'].strip()
1346
 
1347
  if page_num >= len(doc):
1348
+ print(f"\n[{idx}] โŒ ํŽ˜์ด์ง€ ์˜ค๋ฅ˜: {page_num + 1}")
1349
+ total_failed += 1
1350
  continue
1351
 
1352
  page = doc[page_num]
1353
 
1354
+ print(f"\n[{idx}/{len(highlight_info)}]")
1355
+ print(f" ๐Ÿ“„ ํŽ˜์ด์ง€: {page_num + 1}")
1356
+ print(f" ๐Ÿ“ ๊ธธ์ด: {len(text_to_highlight)}์ž")
1357
+ print(f" ๐Ÿ’ฌ ๋‚ด์šฉ: {text_to_highlight[:70]}...")
1358
 
1359
+ # Find the text
1360
+ found_rects = find_text_with_pymupdf(page, text_to_highlight)
1361
 
1362
+ # Deduplicate rectangles at the same position
1363
+ unique_rects = []
1364
+ for rect in found_rects:
1365
+ is_duplicate = False
1366
+ for existing in unique_rects:
1367
+ # Nearly identical coordinates count as duplicates
1368
+ if (abs(rect.x0 - existing.x0) < 5 and
1369
+ abs(rect.y0 - existing.y0) < 5 and
1370
+ abs(rect.x1 - existing.x1) < 5 and
1371
+ abs(rect.y1 - existing.y1) < 5):
1372
+ is_duplicate = True
1373
+ break
1374
+ if not is_duplicate:
1375
+ unique_rects.append(rect)
1376
 
1377
+ # Add the highlight annotations
1378
+ highlighted_count = 0
1379
+ for rect in unique_rects:
1380
+ try:
1381
  highlight = page.add_highlight_annot(rect)
1382
  highlight.set_colors(stroke=yellow_color)
1383
  highlight.update()
1384
+ highlighted_count += 1
1385
+ except Exception as e:
1386
+ print(f" โœ— ํ•˜์ด๋ผ์ดํŠธ ์‹คํŒจ: {e}")
1387
 
1388
+ if highlighted_count > 0:
1389
+ print(f" โœ… ์™„๋ฃŒ: {highlighted_count}๊ฐœ ์˜์—ญ")
1390
+ total_success += 1
1391
+ else:
1392
+ print(f" โŒ ์‹คํŒจ: ํ…์ŠคํŠธ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Œ")
1393
+ total_failed += 1
1394
+
1395
+ print(f"\n{'='*80}")
1396
+ print(f"๐Ÿ“Š ์ตœ์ข… ๊ฒฐ๊ณผ: โœ… ์„ฑ๊ณต {total_success}๊ฐœ / โŒ ์‹คํŒจ {total_failed}๊ฐœ")
1397
+ print(f"{'='*80}\n")
1398
 
1399
  output_bytes = doc.tobytes()
1400
  doc.close()
1401
  return output_bytes
1402
 
 
1403
  def extract_highlights_from_grok(grok_result: Dict) -> List[Dict]:
1404
  if "error" in grok_result:
1405
  return []
 
1417
  return highlights
1418
 
1419
 
1420
+
1421
  def extract_highlights_from_answer(answer: str) -> List[Dict]:
1422
  """
1423
+ Extract the text to highlight from the answer
1424
+ Text both before and after a [페이지 X] tag is attributed to that page
1425
  """
 
 
1426
  highlights = []
1427
 
1428
+ print(f"\n{'='*80}")
1429
+ print(f"๋‹ต๋ณ€ ํ…์ŠคํŠธ ๋ถ„์„ ์ค‘...")
1430
+ print(f"{'='*80}\n")
 
1431
 
1432
+ # Find [페이지 X] patterns
1433
+ page_pattern = r'\[\s*페이지\s*(\d+)\s*\]'
1434
+ page_matches = list(re.finditer(page_pattern, answer))
1435
+
1436
+ print(f"๐Ÿ“ [ํŽ˜์ด์ง€] ํƒœ๊ทธ {len(page_matches)}๊ฐœ ๋ฐœ๊ฒฌ\n")
1437
+
1438
+ quoted_matches = []
1439
+ list_matches = []
1440
+
1441
+ # Analyze the sections before and after each [페이지 X] tag
1442
+ for i, match in enumerate(page_matches):
1443
+ page_num = match.group(1)
1444
+ tag_start = match.start()
1445
+ tag_end = match.end()
1446
+
1447
+ # === Section 1: text before [페이지 X] (within the same paragraph) ===
1449
+ # Back to the previous [페이지] tag, or to a double line break
1449
+ section_start = 0
1450
+ if i > 0:
1451
+ section_start = page_matches[i-1].end()
1452
+
1453
+ # Same paragraph before [페이지 X] (up to a double line break)
1454
+ before_section = answer[section_start:tag_start]
1455
+
1456
+ # Find the last bullet point or quotation
1457
+ last_para_match = re.search(r'([-*○]\s+.+)$', before_section, re.DOTALL)
1458
+ if last_para_match:
1459
+ before_text = last_para_match.group(1)
1460
+ print(f"--- ํŽ˜์ด์ง€ {page_num} ์•ž๋ถ€๋ถ„ (๊ธธ์ด: {len(before_text)}์ž) ---")
1461
+ print(f"{before_text[:150]}...\n")
1462
+
1463
+ # Extract double-quoted citations
1464
+ quotes = re.findall(r'"([^"]+)"', before_text)
1465
+ for quote in quotes:
1466
+ quote_clean = quote.strip()
1467
+ if len(quote_clean) > 10:
1468
+ quoted_matches.append((quote_clean, int(page_num)))
1469
+ print(f" โœ“ [์•ž-์ธ์šฉ๋ฌธ] \"{quote_clean[:60]}...\"")
1470
+
1471
+ # === Section 2: text after [페이지 X] (original logic) ===
1472
+ next_page_pos = len(answer)
1473
+ if i + 1 < len(page_matches):
1474
+ next_page_pos = page_matches[i + 1].start()
1475
+
1476
+ section = answer[tag_end:next_page_pos]
1477
+ print(f"--- ํŽ˜์ด์ง€ {page_num} ๋’ท๋ถ€๋ถ„ (๊ธธ์ด: {len(section)}์ž) ---")
1478
+ print(f"{section[:150]}...\n")
1479
+
1480
+ # Double-quoted citations
1481
+ quotes = re.findall(r'"([^"]+)"', section)
1482
+ for quote in quotes:
1483
+ quote_clean = quote.strip()
1484
+ if len(quote_clean) > 10:
1485
+ quoted_matches.append((quote_clean, int(page_num)))
1486
+ print(f" โœ“ [๋’ค-์ธ์šฉ๋ฌธ] \"{quote_clean[:60]}...\"")
1487
+
1488
+ # List items
1489
+ lines = section.split('\n')
1490
+ for line in lines:
1491
+ line_stripped = line.strip()
1492
+
1493
+ if len(line_stripped) < 3:
1494
+ continue
1495
+
1496
+ if line_stripped.startswith('**') or line_stripped.startswith('#'):
1497
+ continue
1498
+
1499
+ item = None
1500
+
1501
+ if line_stripped.startswith('○'):
1502
+ item = line_stripped[1:].strip()
1503
+ elif line_stripped.startswith('- ') or line_stripped.startswith('* '):
1504
+ item = line_stripped[2:].strip()
1505
+ elif re.match(r'^\d+\.\s+', line_stripped):
1506
+ match_obj = re.match(r'^\d+\.\s+(.+)$', line_stripped)
1507
+ if match_obj:
1508
+ item = match_obj.group(1).strip()
1509
+
1510
+ if item:
1511
+ item = re.sub(r'\[\s*페이지\s*\d+\s*\]', '', item).strip()
1512
+ item = re.sub(r'\*\*([^*]+)\*\*', r'\1', item).strip()
1513
+ item = re.sub(r'\([""""][^)]+[""""\)]+', '', item).strip()
1514
+ item = re.sub(r'\s*\([^)]{0,50}\)\s*$', '', item).strip()
1515
+
1516
+ if 3 <= len(item) <= 200:
1517
+ list_matches.append((item, int(page_num)))
1518
+ print(f" โœ“ [๋ฆฌ์ŠคํŠธ] {item[:50]}...")
1519
+
1520
+ print(f"\n{'='*40}")
1521
+ print(f"๐Ÿ“ ์ธ์šฉ๋ฌธ: {len(quoted_matches)}๊ฐœ")
1522
+ print(f"๐Ÿ“‹ ๋ฆฌ์ŠคํŠธ: {len(list_matches)}๊ฐœ")
1523
+ print(f"{'='*40}\n")
1524
+
1525
+ # Priority rules
1526
+ all_matches = []
1527
+
1528
+ if quoted_matches and list_matches:
1529
+ all_short = all(len(q[0]) <= 30 for q in quoted_matches)
1530
+ if all_short:
1531
+ print(f"โœ“ ์งง์€ ์ธ์šฉ๋ฌธ + ๋ฆฌ์ŠคํŠธ ๋ชจ๋‘")
1532
+ all_matches = quoted_matches + list_matches
1533
+ else:
1534
+ print(f"โœ“ ์ธ์šฉ๋ฌธ๋งŒ")
1535
+ all_matches = quoted_matches
1536
+ elif quoted_matches:
1537
+ print(f"โœ“ ์ธ์šฉ๋ฌธ๋งŒ")
1538
+ all_matches = quoted_matches
1539
+ elif list_matches:
1540
+ print(f"โœ“ ๋ฆฌ์ŠคํŠธ๋งŒ")
1541
+ all_matches = list_matches
1542
+
1543
+ # Deduplicate
1544
+ seen = set()
1545
+ for text, page in all_matches:
1546
+ if text and (text, page) not in seen:
1547
+ highlights.append({
1548
+ 'text': text,
1549
+ 'page': page
1550
+ })
1551
+ seen.add((text, page))
1552
+
1553
+ print(f"\n{'='*80}")
1554
+ print(f"โœ… ์ตœ์ข… ์ถ”์ถœ: {len(highlights)}๊ฐœ")
1555
+ for i, h in enumerate(highlights, 1):
1556
+ print(f" [{i}] ํŽ˜์ด์ง€ {h['page']}: {h['text'][:60]}...")
1557
+ print(f"{'='*80}\n")
1558
 
1559
  return highlights
1560
 
1561
 
1562
+
1563
  def render_pdf_with_highlights(pdf_bytes: bytes, highlight_info: List[Dict], zoom_level: float = 2.0):
1564
  highlighted_pdf = highlight_text_in_pdf(pdf_bytes, highlight_info)
1565
  doc = fitz.open(stream=highlighted_pdf, filetype="pdf")
 
1580
  if (page_num + 1) in highlighted_pages:
1581
pdf_html += f'<div style="background: #FEF08A; color: #854D0E; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold; border-left: 4px solid #EAB308;">⭐ 페이지 {page_num + 1}</div>'
1582
  else:
1583
+ pdf_html += f'<div style="background: #ADADAD; color: white; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold;"> 페이지 {page_num + 1}</div>'
1584
 
1585
  pdf_html += f'<img src="data:image/png;base64,{img_base64}" style="width: {zoom_percentage}%; border: 1px solid #E2E8F0; border-radius: 0.3rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); display: block; margin: 0 auto;" />'
1586
  pdf_html += '</div>'
 
1596
  if not st.session_state.processed:
1597
  col1, col2, col3 = st.columns([1, 1, 1])
1598
  with col2:
1599
+ st.markdown("<div style='height: 30vh;'></div>", unsafe_allow_html=True)
1600
+ st.image("img/plobin-grey.png", use_container_width=True)
1601
  st.text(' ')
1602
 
1603
  with st.sidebar:
1604
+ st.image("img/plobin-right-only.png", width=85)
1605
 
1606
  uploaded_file = st.file_uploader(
1607
  "๋“œ๋ž˜๊ทธํ•˜์—ฌ ํŒŒ์ผ์„ ์—…๋กœ๋“œ ๋˜๋Š” ํด๋ฆญํ•˜์—ฌ ์„ ํƒํ•˜์„ธ์š”.",
 
1612
 
1613
  if uploaded_file:
1614
if st.button("문서 처리 시작", type="primary", use_container_width=True):
1615
+ if not GROK_API_KEY or not OPENAI_API_KEY:
1616
+ st.error("⚠️ GROK_API_KEY 또는 OPENAI_API_KEY가 .env 파일에 설정되지 않았습니다!")
1617
  st.stop()
1618
 
1619
  st.session_state.vector_db = None
 
1621
  st.session_state.chat_history = []
1622
  st.session_state.current_highlights = []
1623
 
1624
+ with st.spinner("문서 분석을 시작합니다..."):
1625
  try:
1626
  chunks, metadata_list, pdf_bytes, pages_text = extract_text_from_pdf(uploaded_file)
1627
 
1628
+ with st.spinner("핵심 내용을 파악하고 있습니다..."):
1629
  collection, embedder = create_vector_db(chunks, metadata_list)
1630
 
1631
  st.session_state.vector_db = collection
 
1639
  "pages": len(set(m['page'] for m in metadata_list))
1640
  }
1641
 
1642
+ # Save the extracted text locally
1643
+ saved_file = save_extracted_text_to_file(
1644
+ chunks,
1645
+ metadata_list,
1646
+ uploaded_file.name
1647
+ )
1648
+
1649
+ st.success("문서 처리 완료!")
1650
  st.rerun()
1651
 
1652
  except Exception as e:
 
1657
  st.info(f"**{st.session_state.doc_metadata['filename']}**")
1658
  st.info(f"ํŽ˜์ด์ง€: {st.session_state.doc_metadata['pages']}")
1659
 
1660
+ # if not st.session_state.processed:
1661
+ # st.markdown("""
1662
+ # <div class="usage-guide">
1663
+ # <h2 style="text-align: center; color: #2D3748; margin-bottom: 1.5rem;">사용 방법</h2>
1664
+ # <div class="guide-step">
1665
+ # <div class="step-number">1</div>
1666
+ # <div>PDF 파일을 올려주세요</div>
1667
+ # </div>
1668
+ # <div class="guide-step">
1669
+ # <div class="step-number">2</div>
1670
+ # <div>문서 처리가 완료될 때까지 잠시만 기다려주세요</div>
1671
+ # </div>
1672
+ # <div class="guide-step">
1673
+ # <div class="step-number">3</div>
1674
+ # <div>문서 내 궁금한 내용을 물어보세요</div>
1675
+ # </div>
1676
+ # <div class="guide-step">
1677
+ # <div class="step-number">4</div>
1678
+ # <div>AI가 정확한 답변과 출처를 함께 알려드려요</div>
1679
+ # </div>
1680
+ # </div>
1681
+ # """, unsafe_allow_html=True)
1682
+
1683
+ if st.session_state.processed:
1684
  col1, col2 = st.columns([1, 1])
1685
 
1686
  with col1:
1687
  header_cols = st.columns([7, 1, 1.5, 1])
1688
  with header_cols[0]:
1689
+ st.markdown("### ")
1690
 
1691
  if st.session_state.pdf_bytes:
1692
  pdf_html = render_pdf_with_highlights(
 
1719
  st.session_state.scroll_to_page = None
1720
 
1721
  with col2:
1722
+ st.markdown('### ', unsafe_allow_html=True)
1723
 
1724
  chat_container = st.container(height=650)
1725
 
 
1727
  for msg_idx, msg in enumerate(st.session_state.chat_history):
1728
  with st.chat_message(msg["role"]):
1729
  st.markdown(msg["content"])
 
1730
 
1731
+ prompt = st.chat_input("질문을 입력하세요...", key="chat_input")
1732
 
1733
  if prompt:
1734
  st.session_state.chat_history.append({"role": "user", "content": prompt})
1735
  st.session_state.processing_query = prompt
1736
  st.rerun()
1737
 
1738
+ # Query handling inside main()
1739
  if st.session_state.processing_query:
1740
  query = st.session_state.processing_query
1741
  st.session_state.processing_query = None
1742
 
1743
+ with st.spinner("PLOBIN이 최적의 답변을 찾고 있습니다..."):
1744
  try:
1745
  search_results = hybrid_search(
1746
  query,
 
1761
  GROK_API_KEY
1762
  )
1763
 
1764
+ # ⭐ Important: extract only the text inside double quotes
1765
+ print("\n" + "="*80)
1766
+ print("답변에서 인용문 추출 중...")
1767
+ print("="*80)
1768
  highlights = extract_highlights_from_answer(answer)
1769
+
1770
+ # Highlights extracted from grok_result are not used (uncomment if needed)
1771
+ # grok_highlights = extract_highlights_from_grok(grok_result)
1772
+ # highlights.extend(grok_highlights)
1773
+
1774
  st.session_state.current_highlights = highlights
1775
 
1776
  if grok_result and "page" in grok_result and "error" not in grok_result:
 
1778
 
1779
  chat_data = {
1780
  "role": "assistant",
1781
+ "content": answer
 
1782
  }
1783
  st.session_state.chat_history.append(chat_data)
1784
  st.rerun()
 
1793
 
1794
 
1795
  if __name__ == "__main__":
1796
+ main()