dohyune committed on
Commit
41800ca
·
verified ·
1 Parent(s): 4571aae

Update app.py

Files changed (1)
  1. app.py +833 -238
app.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
  PLOBIN
3
  """
 
4
  import streamlit as st
5
  import streamlit.components.v1 as components
6
  import fitz # PyMuPDF
@@ -17,6 +18,7 @@ import base64
17
  from dotenv import load_dotenv
18
  import json
19
  from difflib import SequenceMatcher
 
20
 
21
  def get_svg_content(svg_path):
22
  with open(svg_path, "r", encoding="utf-8") as f:
@@ -28,6 +30,8 @@ load_dotenv()
28
 
29
  GROK_API_KEY = os.getenv("GROK_API_KEY")
30
  GROK_API_BASE = "https://api.x.ai/v1"
 
 
31
  CHROMA_DIR = "./chroma_db"
32
  EMBEDDING_MODEL = 'jhgan/ko-sroberta-multitask'
33
 
@@ -46,11 +50,13 @@ st.markdown("""
46
  <style>
47
  [data-testid="stSidebar"] {
48
  background: linear-gradient(180deg,
49
- #618FC2 0%,
50
- #8E969E 100%);
51
- box-shadow: 4px 0 30px rgba(0,0,0,0.2);
52
- width: 290px !important;
 
53
  }
 
54
 
55
  [data-testid="stSidebar"] h1 {
56
  color: white !important;
@@ -62,7 +68,7 @@ st.markdown("""
62
  animation: sidebarTitlePulse 4s ease-in-out infinite;
63
  letter-spacing: 2px;
64
  }
65
-
66
  @keyframes sidebarTitlePulse {
67
  0%, 100% {
68
  transform: scale(1);
@@ -82,13 +88,15 @@ st.markdown("""
82
  }
83
 
84
  [data-testid="stSidebar"] [data-testid="stFileUploader"] {
85
- background: rgba(255,255,255,0.15);
86
  border-radius: 15px;
87
  padding: 1.5rem;
88
- border: 3px dashed rgba(255,255,255,0.4);
89
  transition: all 0.3s ease;
90
  backdrop-filter: blur(10px);
91
  }
 
 
92
 
93
  [data-testid="stFileUploader"] > section {
94
  background: transparent !important;
@@ -99,9 +107,15 @@ st.markdown("""
99
  }
100
 
101
  [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] {
102
- color: #fafafa;
103
  }
104
 
 
 
 
 
 
 
105
  [data-testid="stSidebar"] [data-testid="stFileUploader"] > section,
106
  [data-testid="stSidebar"] [data-testid="stFileUploader"] section > div {
107
  background: transparent !important;
@@ -109,19 +123,19 @@ st.markdown("""
109
  }
110
 
111
  [data-testid="stSidebar"] [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] p {
112
- color: rgba(255,255,255,0.9) !important;
113
  }
114
 
115
  [data-testid="stSidebar"] [data-testid="stFileUploader"] button[kind="secondary"] {
116
- background: rgba(255,255,255,0.2) !important;
117
- color: white !important;
118
- border: 1px solid rgba(255,255,255,0.3) !important;
119
  }
120
 
121
  [data-testid="stSidebar"] .stButton button {
122
- background: rgba(255,255,255,0.15) !important;
123
- color: white !important;
124
- border: 2px solid rgba(255,255,255,0.4) !important;
125
  border-radius: 12px !important;
126
  font-weight: 700 !important;
127
  padding: 0.75rem 1.5rem !important;
@@ -131,10 +145,9 @@ st.markdown("""
131
  }
132
 
133
  [data-testid="stSidebar"] .stButton button:hover {
134
- background: rgba(255,255,255,0.25) !important;
135
- border-color: rgba(255,255,255,0.6) !important;
136
  transform: translateY(-2px) scale(1.02) !important;
137
- box-shadow: 0 6px 20px rgba(0,0,0,0.2) !important;
138
  }
139
 
140
  [data-testid="stSidebar"] .stButton button:active {
@@ -153,13 +166,48 @@ st.markdown("""
153
  }
154
 
155
  [data-testid="stSidebar"] [data-testid="stAlert"] {
156
- background-color: rgba(255, 255, 255, 0.001) !important;
157
  border-radius: 0.5rem !important;
158
  }
 
 
 
 
 
 
159
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  [data-testid="stAlert"] p {
161
- color: rgb(250, 250, 250);
 
 
 
 
 
 
 
 
 
 
 
162
  }
 
 
 
 
 
 
163
 
164
  .main .block-container {
165
  max-width: 100%;
@@ -370,9 +418,76 @@ st.markdown("""
370
  box-shadow: 0 4px 8px rgba(234, 179, 8, 0.3) !important;
371
  background: linear-gradient(135deg, #FDE047 0%, #FACC15 100%) !important;
372
  }
373
  </style>
374
  """, unsafe_allow_html=True)
375
 
376
 
377
  def init_session():
378
  if 'processed' not in st.session_state:
@@ -401,7 +516,125 @@ def init_session():
401
  st.session_state.scroll_to_page = None
402
 
403
 
404
  def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]:
405
  pdf_bytes = pdf_file.read()
406
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
407
 
@@ -412,54 +645,189 @@ def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]
412
  CHUNK_SIZE = 800
413
  OVERLAP_SIZE = 150
414
 
415
- for page_num in range(len(doc)):
416
- page = doc[page_num]
417
- text = page.get_text("text")
418
- pages_text[page_num + 1] = text
419
-
420
- if not text.strip():
421
- continue
422
-
423
- lines = [line.strip() for line in text.split('\n') if line.strip()]
424
- cleaned_text = '\n'.join(lines)
425
-
426
- sentences = re.split(r'([.!?]\s+|\n{2,})', cleaned_text)
427
- sentences = [s for s in sentences if s.strip()]
428
-
429
- current_chunk = ""
430
- current_length = 0
431
-
432
- for sentence in sentences:
433
- sentence_length = len(sentence)
434
 
435
- if current_length + sentence_length > CHUNK_SIZE and current_chunk:
436
- chunks.append(current_chunk.strip())
437
- metadata_list.append({
438
- "page": page_num + 1,
439
- "source": pdf_file.name,
440
- "chunk_type": "paragraph"
441
- })
442
 
443
- overlap_text = current_chunk[-OVERLAP_SIZE:] if len(current_chunk) > OVERLAP_SIZE else current_chunk
444
- current_chunk = overlap_text + sentence
445
- current_length = len(current_chunk)
446
  else:
447
- current_chunk += sentence
448
- current_length += sentence_length
449
-
450
- if current_chunk.strip():
451
- chunks.append(current_chunk.strip())
452
- metadata_list.append({
453
- "page": page_num + 1,
454
- "source": pdf_file.name,
455
- "chunk_type": "paragraph"
456
- })
457
 
458
  doc.close()
459
  return chunks, metadata_list, pdf_bytes, pages_text
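For reference, the sentence-overlap chunking deleted above reduces to the following minimal sketch (`chunk_text` is a name introduced here for illustration; the constants mirror `extract_text_from_pdf`):

```python
import re
from typing import List

CHUNK_SIZE = 800    # max characters per chunk
OVERLAP_SIZE = 150  # tail of the previous chunk carried into the next

def chunk_text(text: str) -> List[str]:
    # Split on sentence enders or blank lines, keeping the delimiters.
    sentences = [s for s in re.split(r'([.!?]\s+|\n{2,})', text) if s.strip()]
    chunks, current = [], ""
    for sentence in sentences:
        if len(current) + len(sentence) > CHUNK_SIZE and current:
            chunks.append(current.strip())
            # Start the next chunk with the previous chunk's tail as overlap.
            current = current[-OVERLAP_SIZE:] + sentence
        else:
            current += sentence
    if current.strip():
        chunks.append(current.strip())
    return chunks
```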
460
 
461
 
462
- @st.cache_resource
463
  def load_embedding_model():
464
  return SentenceTransformer(EMBEDDING_MODEL)
465
 
@@ -555,12 +923,21 @@ def hybrid_search(query: str, collection, embedder, top_k: int = 3) -> Dict:
555
  vector_score = 1 - vector_results['distances'][0][i]
556
 
557
  keyword_score = 0
 
 
558
  doc_lower = doc.lower()
 
 
559
  for keyword in keywords:
560
- if keyword.lower() in doc_lower:
561
  keyword_score += 1
562
- keyword_score = keyword_score / len(keywords) if keywords else 0
563
 
 
564
  hybrid_score = 0.7 * vector_score + 0.3 * keyword_score
565
 
566
  hybrid_results.append({
@@ -642,7 +1019,7 @@ def grok_verify_and_extract(query: str, search_results: Dict, api_key: str) -> D
642
  f"{GROK_API_BASE}/chat/completions",
643
  headers=headers,
644
  json=payload,
645
- timeout=30
646
  )
647
 
648
  if response.status_code != 200:
@@ -759,7 +1136,7 @@ def generate_answer(query: str, search_results: Dict, api_key: str) -> str:
759
  f"{GROK_API_BASE}/chat/completions",
760
  headers=headers,
761
  json=payload,
762
- timeout=30
763
  )
764
 
765
  if response.status_code != 200:
@@ -781,139 +1158,248 @@ def generate_answer(query: str, search_results: Dict, api_key: str) -> str:
781
 
782
  def highlight_text_in_pdf(pdf_bytes: bytes, highlight_info: List[Dict]) -> bytes:
783
  """
784
- Split into sentences, find each one, and highlight every match (more aggressive)
785
  """
786
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
787
  yellow_color = [1.0, 1.0, 0.0]
788
 
789
  def normalize_text(text):
790
- return re.sub(r'\s+', ' ', text.strip().lower())
 
791
 
792
- def find_text_fuzzy(page, search_text, threshold=0.65):
793
- """ํผ์ง€ ๋งค์นญ์œผ๋กœ ํ…์ŠคํŠธ ์˜์—ญ ์ฐพ๊ธฐ (์ž„๊ณ„๊ฐ’ ๋‚ฎ์ถค)"""
794
- search_norm = normalize_text(search_text)
 
795
 
796
- # 1. Try exact-match variants first
797
- variations = [
798
- search_text,
799
- search_text.replace(' ', ''),
800
- search_text.replace('\n', ' '),
801
- search_text.replace(',', ''),
802
- ]
803
 
804
- for var in variations:
805
- instances = page.search_for(var)
806
- if instances:
807
- return instances
808
 
809
- # 2. Block-level fuzzy matching
810
- blocks = page.get_text("blocks")
811
  for block in blocks:
812
- if len(block) < 5:
813
  continue
814
-
815
- block_text = block[4]
816
- block_norm = normalize_text(block_text)
817
-
818
- similarity = SequenceMatcher(None, search_norm, block_norm).ratio()
819
- if similarity >= threshold:
820
- return [fitz.Rect(block[0], block[1], block[2], block[3])]
821
 
822
- # 3. Word-level matching
823
- words = page.get_text("words")
824
- if not words:
825
  return []
826
 
827
- search_words = search_norm.split()
828
- min_words = max(2, len(search_words) // 3) # OK if only a third of the words match
829
-
830
- best_match = None
831
- best_sim = 0.0
832
 
833
- for i in range(len(words)):
834
- for size in range(len(search_words), min_words - 1, -1):
835
- if i + size > len(words):
836
- continue
837
 
838
- window = words[i:i + size]
839
- window_text = " ".join([w[4] for w in window])
840
- window_norm = normalize_text(window_text)
841
 
842
- sim = SequenceMatcher(None, search_norm, window_norm).ratio()
843
- if sim > best_sim and sim >= threshold:
844
- best_sim = sim
845
- rect = fitz.Rect(window[0][:4])
846
- for w in window[1:]:
847
- rect = rect | fitz.Rect(w[:4])
848
- best_match = rect
849
-
850
- if best_match:
851
- return [best_match]
852
 
853
  return []
854
 
855
- for item in highlight_info:
856
  page_num = item['page'] - 1
857
- full_text = item['text'].strip()
858
 
859
  if page_num >= len(doc):
 
 
860
  continue
861
 
862
  page = doc[page_num]
863
 
864
- # Strategy 1: split on sentence-ending periods (ignore commas)
865
- sentences = re.split(r'([.。]\s*)', full_text)
866
- sentences = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 1]
 
867
 
868
- # Reattach the period to the preceding sentence
869
- combined = []
870
- i = 0
871
- while i < len(sentences):
872
- if i + 1 < len(sentences) and sentences[i+1] in ['.', '。']:
873
- combined.append(sentences[i] + sentences[i+1])
874
- i += 2
875
- else:
876
- combined.append(sentences[i])
877
- i += 1
878
 
879
- # Find each sentence individually (8+ characters)
880
- found_any = False
881
- for sentence in combined:
882
- if len(sentence) < 8: # lowered from 10 to 8 characters
883
- continue
884
-
885
- rects = find_text_fuzzy(page, sentence, threshold=0.60) # 0.70 → 0.60
886
- if rects:
887
- found_any = True
888
- for rect in rects:
889
- highlight = page.add_highlight_annot(rect)
890
- highlight.set_colors(stroke=yellow_color)
891
- highlight.update()
 
892
 
893
- # Strategy 2: if sentence-level search fails, retry the full text at a lower threshold
894
- if not found_any:
895
- rects = find_text_fuzzy(page, full_text, threshold=0.50) # 0.60 → 0.50
896
- for rect in rects:
897
  highlight = page.add_highlight_annot(rect)
898
  highlight.set_colors(stroke=yellow_color)
899
  highlight.update()
900
 
901
- # Strategy 3: failing that, highlight at least the key phrases
902
- if not found_any:
903
- # Extract noun-like runs of 10+ Hangul characters
904
- keywords = re.findall(r'[가-힣]{10,}', full_text)
905
- for kw in keywords[:3]: # top 3 only
906
- rects = find_text_fuzzy(page, kw, threshold=0.70)
907
- for rect in rects:
908
- highlight = page.add_highlight_annot(rect)
909
- highlight.set_colors(stroke=yellow_color)
910
- highlight.update()
911
 
912
  output_bytes = doc.tobytes()
913
  doc.close()
914
  return output_bytes
915
 
916
-
917
  def extract_highlights_from_grok(grok_result: Dict) -> List[Dict]:
  return highlights
932
938
  """
939
- import re
- pattern = r'"([^"]+)"\s*\[ํŽ˜์ด์ง€\s+(\d+)\]'
946
- highlights.append({
950
- 'text': quote.strip(),
951
- 'page': int(quote_page)
952
- })
953
 
954
  return highlights
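The deleted extractor above hinges on a single regex; a quick demo on a made-up answer shows what it captures:

```python
import re

# Hypothetical answer text in the format the prompt asks the model to emit.
answer = '근거는 "제안업체는 보안 계획을 제안하여야 함" [페이지 9] 에 있습니다.'
pattern = r'"([^"]+)"\s*\[페이지\s+(\d+)\]'
print(re.findall(pattern, answer))
# [('제안업체는 보안 계획을 제안하여야 함', '9')]
```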
955
 
956
 
 
957
  def render_pdf_with_highlights(pdf_bytes: bytes, highlight_info: List[Dict], zoom_level: float = 2.0):
958
  highlighted_pdf = highlight_text_in_pdf(pdf_bytes, highlight_info)
959
  doc = fitz.open(stream=highlighted_pdf, filetype="pdf")
@@ -974,7 +1580,7 @@ def render_pdf_with_highlights(pdf_bytes: bytes, highlight_info: List[Dict], zoo
974
  if (page_num + 1) in highlighted_pages:
975
pdf_html += f'<div style="background: #FEF08A; color: #854D0E; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold; border-left: 4px solid #EAB308;">⭐ 페이지 {page_num + 1}</div>'
976
  else:
977
- pdf_html += f'<div style="background: #667eea; color: white; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold;">📄 페이지 {page_num + 1}</div>'
978
 
979
  pdf_html += f'<img src="data:image/png;base64,{img_base64}" style="width: {zoom_percentage}%; border: 1px solid #E2E8F0; border-radius: 0.3rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); display: block; margin: 0 auto;" />'
980
  pdf_html += '</div>'
@@ -990,11 +1596,12 @@ def main():
990
  if not st.session_state.processed:
991
  col1, col2, col3 = st.columns([1, 1, 1])
992
  with col2:
993
- st.image("img/plobin.svg", use_container_width=True)
 
994
  st.text(' ')
995
 
996
  with st.sidebar:
997
- st.image("img/plobin.svg", width=120)
998
 
999
  uploaded_file = st.file_uploader(
1000
  "๋“œ๋ž˜๊ทธํ•˜์—ฌ ํŒŒ์ผ์„ ์—…๋กœ๋“œ ๋˜๋Š” ํด๋ฆญํ•˜์—ฌ ์„ ํƒํ•˜์„ธ์š”.",
@@ -1005,8 +1612,8 @@ def main():
1005
 
1006
  if uploaded_file:
1007
if st.button("문서 처리 시작", type="primary", use_container_width=True):
1008
- if not GROK_API_KEY:
1009
- st.error("⚠️ GROK_API_KEY가 .env 파일에 설정되지 않았습니다!")
1010
  st.stop()
1011
 
1012
  st.session_state.vector_db = None
@@ -1014,11 +1621,11 @@ def main():
1014
  st.session_state.chat_history = []
1015
  st.session_state.current_highlights = []
1016
 
1017
- with st.spinner("문서 처리 중..."):
1018
  try:
1019
  chunks, metadata_list, pdf_bytes, pages_text = extract_text_from_pdf(uploaded_file)
1020
 
1021
- with st.spinner("문서를 AI가 이해할 수 있게 처리 중.."):
1022
  collection, embedder = create_vector_db(chunks, metadata_list)
1023
 
1024
  st.session_state.vector_db = collection
@@ -1032,7 +1639,14 @@ def main():
1032
  "pages": len(set(m['page'] for m in metadata_list))
1033
  }
1034
 
1035
- st.success("문서 처리 완료!")
1036
  st.rerun()
1037
 
1038
  except Exception as e:
@@ -1043,36 +1657,36 @@ def main():
1043
  st.info(f"**{st.session_state.doc_metadata['filename']}**")
1044
  st.info(f"ํŽ˜์ด์ง€: {st.session_state.doc_metadata['pages']}")
1045
 
1046
- if not st.session_state.processed:
1047
- st.markdown("""
1048
- <div class="usage-guide">
1049
- <h2 style="text-align: center; color: #2D3748; margin-bottom: 1.5rem;">사용 방법</h2>
1050
- <div class="guide-step">
1051
- <div class="step-number">1</div>
1052
- <div>PDF 파일을 올려주세요</div>
1053
- </div>
1054
- <div class="guide-step">
1055
- <div class="step-number">2</div>
1056
- <div>문서 처리가 완료될 때까지 잠시만 기다려주세요</div>
1057
- </div>
1058
- <div class="guide-step">
1059
- <div class="step-number">3</div>
1060
- <div>문서 내 궁금한 내용을 물어보세요</div>
1061
- </div>
1062
- <div class="guide-step">
1063
- <div class="step-number">4</div>
1064
- <div>AI가 정확한 답변과 출처를 함께 알려드려요</div>
1065
- </div>
1066
- </div>
1067
- """, unsafe_allow_html=True)
1068
-
1069
- else:
1070
  col1, col2 = st.columns([1, 1])
1071
 
1072
  with col1:
1073
  header_cols = st.columns([7, 1, 1.5, 1])
1074
  with header_cols[0]:
1075
- st.markdown("### 문서 뷰어")
1076
 
1077
  if st.session_state.pdf_bytes:
1078
  pdf_html = render_pdf_with_highlights(
@@ -1105,7 +1719,7 @@ def main():
1105
  st.session_state.scroll_to_page = None
1106
 
1107
  with col2:
1108
- st.markdown('### PLOBIN CHAT', unsafe_allow_html=True)
1109
 
1110
  chat_container = st.container(height=650)
1111
 
@@ -1113,41 +1727,20 @@ def main():
1113
  for msg_idx, msg in enumerate(st.session_state.chat_history):
1114
  with st.chat_message(msg["role"]):
1115
  st.markdown(msg["content"])
1116
-
1117
- if msg["role"] == "assistant" and "sources" in msg:
1118
- with st.expander("📚 참조 문서"):
1119
- for idx, (doc, meta) in enumerate(zip(msg["sources"]["docs"], msg["sources"]["metas"])):
1120
- clean_text = doc[:150] + ('...' if len(doc) > 150 else '')
1121
-
1122
- if st.button(
1123
- f"ํŽ˜์ด์ง€ {meta['page']}",
1124
- key=f"goto_source_msg{msg_idx}_{meta['page']}_{idx}",
1125
- use_container_width=True,
1126
- type="secondary"
1127
- ):
1128
- st.session_state.scroll_to_page = meta['page']
1129
- st.rerun()
1130
-
1131
- st.markdown(f"""
1132
- <div style="background: #F1F5F9; padding: 0.8rem; border-radius: 0.5rem; margin-bottom: 1rem; border-left: 3px solid #667eea;">
1133
- <div style="font-size: 0.9rem; color: #475569;">
1134
- {clean_text}
1135
- </div>
1136
- </div>
1137
- """, unsafe_allow_html=True)
1138
 
1139
- prompt = st.chat_input("💬 질문을 입력하세요...", key="chat_input")
1140
 
1141
  if prompt:
1142
  st.session_state.chat_history.append({"role": "user", "content": prompt})
1143
  st.session_state.processing_query = prompt
1144
  st.rerun()
1145
 
 
1146
  if st.session_state.processing_query:
1147
  query = st.session_state.processing_query
1148
  st.session_state.processing_query = None
1149
 
1150
- with st.spinner("PLOBIN이 검색중입니다..."):
1151
  try:
1152
  search_results = hybrid_search(
1153
  query,
@@ -1168,7 +1761,16 @@ def main():
1168
  GROK_API_KEY
1169
  )
1170
 
1171
  highlights = extract_highlights_from_answer(answer)
1172
  st.session_state.current_highlights = highlights
1173
 
1174
  if grok_result and "page" in grok_result and "error" not in grok_result:
@@ -1176,14 +1778,7 @@ def main():
1176
 
1177
  chat_data = {
1178
  "role": "assistant",
1179
- "content": answer,
1180
- "sources": {
1181
- "docs": search_results['documents'][0],
1182
- "metas": search_results['metadatas'][0],
1183
- "scores": search_results.get('scores', []),
1184
- "keywords": search_results.get('keywords', []),
1185
- "grok_verified": grok_result
1186
- }
1187
  }
1188
  st.session_state.chat_history.append(chat_data)
1189
  st.rerun()
@@ -1198,4 +1793,4 @@ def main():
1198
 
1199
 
1200
  if __name__ == "__main__":
1201
- main()
 
1
  """
2
  PLOBIN
3
  """
4
+ import difflib
5
  import streamlit as st
6
  import streamlit.components.v1 as components
7
  import fitz # PyMuPDF
 
18
  from dotenv import load_dotenv
19
  import json
20
  from difflib import SequenceMatcher
21
+ import pdfplumber
22
 
23
  def get_svg_content(svg_path):
24
  with open(svg_path, "r", encoding="utf-8") as f:
 
30
 
31
  GROK_API_KEY = os.getenv("GROK_API_KEY")
32
  GROK_API_BASE = "https://api.x.ai/v1"
33
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
34
+ OPENAI_API_BASE = "https://api.openai.com/v1"
35
  CHROMA_DIR = "./chroma_db"
36
  EMBEDDING_MODEL = 'jhgan/ko-sroberta-multitask'
37
 
 
50
  <style>
51
  [data-testid="stSidebar"] {
52
  background: linear-gradient(180deg,
53
+ #f9f9f9 0%,
54
+ #f9f9f9 100%);
55
+ box-shadow: none;
56
+ border-right: 1px solid #ededed;
57
+ width: 280px !important;
58
  }
59
+
60
 
61
  [data-testid="stSidebar"] h1 {
62
  color: white !important;
 
68
  animation: sidebarTitlePulse 4s ease-in-out infinite;
69
  letter-spacing: 2px;
70
  }
71
+
72
  @keyframes sidebarTitlePulse {
73
  0%, 100% {
74
  transform: scale(1);
 
88
  }
89
 
90
  [data-testid="stSidebar"] [data-testid="stFileUploader"] {
91
+ background: rgba(198,198,198,0.15);
92
  border-radius: 15px;
93
  padding: 1.5rem;
94
+ border: 1.5px dashed rgba(198,198,198,0.4);
95
  transition: all 0.3s ease;
96
  backdrop-filter: blur(10px);
97
  }
98
+
99
+
100
 
101
  [data-testid="stFileUploader"] > section {
102
  background: transparent !important;
 
107
  }
108
 
109
  [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] {
110
+ color: #c6c6c6;
111
  }
112
 
113
+ /* Change the user message icon */
114
+ [data-testid="stChatMessage"][data-testid="user"]
115
+ [data-testid="chat-message-avatar"] img {
116
+ content: url("https://your-image-url.com/user-icon.png") !important;
117
+ }
118
+
119
  [data-testid="stSidebar"] [data-testid="stFileUploader"] > section,
120
  [data-testid="stSidebar"] [data-testid="stFileUploader"] section > div {
121
  background: transparent !important;
 
123
  }
124
 
125
  [data-testid="stSidebar"] [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] p {
126
+ color: #555555 !important;
127
  }
128
 
129
  [data-testid="stSidebar"] [data-testid="stFileUploader"] button[kind="secondary"] {
130
+ background: rgba(127,128,134,0.2) !important;
131
+ color: #8A8A8A !important;
132
+ border: 1px solid rgba(127,128,134,0.3) !important;
133
  }
134
 
135
  [data-testid="stSidebar"] .stButton button {
136
+ background: rgba(127,128,134,0.15) !important;
137
+ color: #555555 !important;
138
+ border: 2px solid rgba(127,128,134,0.4) !important;
139
  border-radius: 12px !important;
140
  font-weight: 700 !important;
141
  padding: 0.75rem 1.5rem !important;
 
145
  }
146
 
147
  [data-testid="stSidebar"] .stButton button:hover {
148
+ background: rgba(255, 36, 36,0.25) !important;
149
+ border-color: rgba(255, 36, 36,0.6) !important;
150
  transform: translateY(-2px) scale(1.02) !important;
 
151
  }
152
 
153
  [data-testid="stSidebar"] .stButton button:active {
 
166
  }
167
 
168
  [data-testid="stSidebar"] [data-testid="stAlert"] {
169
+ background-color: #f2f2f2 !important;
170
  border-radius: 0.5rem !important;
171
  }
172
+
173
+
174
+
175
+ [data-testid="stSidebar"] [data-testid="stFileUploader"] button {
176
+ display: block;
177
+ }
178
 
179
+ /* Keep the sidebar collapse/expand button always visible */
180
+ [data-testid="stSidebarCollapseButton"] {
181
+ opacity: 1 !important;
182
+ visibility: visible !important;
183
+ transition: opacity 0.2s ease !important;
184
+ }
185
+
186
+ /* No hover fade: keep it fully opaque */
187
+ [data-testid="stSidebarCollapseButton"]:hover {
188
+ opacity: 1 !important;
189
+ }
190
+
191
  [data-testid="stAlert"] p {
192
+ color: #747474;
193
+ }
194
+
195
+ /* Force-style the whole sidebar alert box */
196
+ [data-testid="stSidebar"] [data-testid="stAlert"] {
197
+ background-color: #f2f2f2 !important; /* desired background color */
198
+ border-radius: 0.5rem !important;
199
+ }
200
+
201
+ /* Force the color onto the alert's inner container as well */
202
+ [data-testid="stSidebar"] [data-testid="stAlert"] > div {
203
+ background-color: #f2f2f2 !important;
204
  }
205
+
206
+ /* Innermost alert message box */
207
+ [data-testid="stSidebar"] [data-testid="stAlert"] [role="alert"] {
208
+ background-color: #f2f2f2 !important;
209
+ }
210
+
211
 
212
  .main .block-container {
213
  max-width: 100%;
 
418
  box-shadow: 0 4px 8px rgba(234, 179, 8, 0.3) !important;
419
  background: linear-gradient(135deg, #FDE047 0%, #FACC15 100%) !important;
420
  }
421
+
422
+ /* Change the chat input border color on focus */
423
+ [data-testid="stChatInput"] textarea:focus {
424
+ border-color: #3f3f3f !important;
425
+ box-shadow: 0 0 0 1px #3f3f3f !important;
426
+ }
427
+
428
+ /* ์ฑ„ํŒ… ์ž…๋ ฅ์ฐฝ ๊ธฐ๋ณธ ์ƒํƒœ */
429
+ [data-testid="stChatInput"] textarea {
430
+ border-color: #3f3f3f !important;
431
+ transition: border-color 0.2s ease;
432
+ }
433
+
434
+ /* Hover state */
435
+ [data-testid="stChatInput"] textarea:hover {
436
+ border-color: #3f3f3f !important;
437
+ }
438
+
439
+ /* Hide Streamlit's default avatar */
440
+ [data-testid="stChatMessage"][data-testid="user"]
441
+ [data-testid="chat-message-avatar"] img {
442
+ display: none !important;
443
+ }
444
+
445
+ /* Replace with a custom icon */
446
+ [data-testid="stChatMessage"][data-testid="user"]
447
+ [data-testid="chat-message-avatar"] {
448
+ background-image: url("final/img/user-profile.png");
449
+ background-size: cover;
450
+ background-position: center;
451
+ width: 36px !important;
452
+ height: 36px !important;
453
+ border-radius: 50%; /* circular */
454
+ }
455
+
456
+ /* Remove the default avatar */
457
+ [data-testid="stChatMessage"][data-testid="assistant"]
458
+ [data-testid="chat-message-avatar"] img {
459
+ display: none !important;
460
+ }
461
+
462
+ /* Set a custom icon */
463
+ [data-testid="stChatMessage"][data-testid="assistant"]
464
+ [data-testid="chat-message-avatar"] {
465
+ background-image: url("final/img/cloud.png");
466
+ background-size: cover;
467
+ background-position: center;
468
+ width: 36px !important;
469
+ height: 36px !important;
470
+ border-radius: 50%;
471
+ }
472
+
473
+
474
+
475
  </style>
476
  """, unsafe_allow_html=True)
477
 
478
+ SPACE_RE = re.compile(r'\s+')
479
+
480
+ def normalize_for_search(text: str) -> str:
481
+ """
482
+ Normalize text for search/matching:
483
+ - strip leading/trailing whitespace
484
+ - lowercase
485
+ - remove all whitespace (so spacing differences are ignored)
486
+ """
487
+ text = text.strip().lower()
488
+ text = SPACE_RE.sub('', text) # drop every whitespace character
489
+ return text
490
+
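A quick check of the helper on spacing variants (redefined inline so the snippet runs on its own; the strings are made up):

```python
import re

SPACE_RE = re.compile(r'\s+')

def normalize_for_search(text: str) -> str:
    # Strip, lowercase, then drop every whitespace character.
    return SPACE_RE.sub('', text.strip().lower())

print(normalize_for_search("계약 체결일"))  # '계약체결일'
print(normalize_for_search("계약체결일"))   # '계약체결일' – same key either way
```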
491
 
492
  def init_session():
493
  if 'processed' not in st.session_state:
 
516
  st.session_state.scroll_to_page = None
517
 
518
 
519
+ def extract_table_image_as_base64(pdf_bytes: bytes, page_num: int, bbox: tuple) -> str:
520
+ """
521
+ Extract a table region from a PDF page as an image and encode it as base64
522
+
523
+ Args:
524
+ pdf_bytes: raw PDF bytes
525
+ page_num: page number (0-based)
526
+ bbox: (x0, y0, x1, y1) table region coordinates
527
+
528
+ Returns:
529
+ base64-encoded image string
530
+ """
531
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
532
+ page = doc[page_num]
533
+
534
+ # Render the bbox region to an image (high resolution)
535
+ rect = fitz.Rect(bbox)
536
+ pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0), clip=rect)
537
+ img_bytes = pix.tobytes("png")
538
+
539
+ doc.close()
540
+
541
+ # Encode as base64
542
+ img_base64 = base64.b64encode(img_bytes).decode('utf-8')
543
+ return img_base64
544
+
545
+
546
+ def convert_table_to_markdown_with_vision(
547
+ pdf_bytes: bytes,
548
+ page_num: int,
549
+ bbox: tuple,
550
+ api_key: str
551
+ ) -> str:
552
+ """
553
+ Convert a table image to Markdown using the OpenAI Vision API
554
+
555
+ Args:
556
+ pdf_bytes: raw PDF bytes
557
+ page_num: page number
558
+ bbox: table region coordinates
559
+ api_key: OpenAI API key
560
+
561
+ Returns:
562
+ the table in Markdown format
563
+ """
564
+ # Extract the table region as an image
565
+ img_base64 = extract_table_image_as_base64(pdf_bytes, page_num, bbox)
566
+
567
+ # Call the OpenAI Vision API
568
+ prompt = """์ด ์ด๋ฏธ์ง€๋Š” PDF ๋ฌธ์„œ์˜ ํ‘œ์ž…๋‹ˆ๋‹ค.
569
+ ํ‘œ์˜ ๋‚ด์šฉ์„ ์ •ํ™•ํ•˜๊ฒŒ ๋งˆํฌ๋‹ค์šด ํ‘œ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ํ•ด์ฃผ์„ธ์š”.
570
+
571
+ ๊ทœ์น™:
572
+ 1. ์…€ ๋ณ‘ํ•ฉ์ด ์žˆ์œผ๋ฉด ์ ์ ˆํžˆ ์ฒ˜๋ฆฌ
573
+ 2. ์ค‘์ฒฉ๋œ ํ‘œ๊ฐ€ ์žˆ์œผ๋ฉด ํ…์ŠคํŠธ๋กœ ํ‘œํ˜„
574
+ 3. ๋นˆ ์…€์€ ๋นˆ ์นธ์œผ๋กœ ์œ ์ง€
575
+ 4. ํ‘œ ํ˜•์‹๋งŒ ๋ฐ˜ํ™˜ (์ถ”๊ฐ€ ์„ค๋ช… ์—†์ด)
576
+
577
+ ๋งˆํฌ๋‹ค์šด ํ‘œ ํ˜•์‹:
578
+ | ์—ด1 | ์—ด2 | ์—ด3 |
579
+ | --- | --- | --- |
580
+ | ๋ฐ์ดํ„ฐ1 | ๋ฐ์ดํ„ฐ2 | ๋ฐ์ดํ„ฐ3 |"""
581
+
582
+ try:
583
+ response = requests.post(
584
+ f"{OPENAI_API_BASE}/chat/completions",
585
+ headers={
586
+ "Authorization": f"Bearer {api_key}",
587
+ "Content-Type": "application/json"
588
+ },
589
+ json={
590
+ "model": "gpt-4o", # gpt-4o ๋˜๋Š” gpt-4o-mini
591
+ "messages": [
592
+ {
593
+ "role": "user",
594
+ "content": [
595
+ {
596
+ "type": "text",
597
+ "text": prompt
598
+ },
599
+ {
600
+ "type": "image_url",
601
+ "image_url": {
602
+ "url": f"data:image/png;base64,{img_base64}",
603
+ "detail": "high" # "low", "high", "auto"
604
+ }
605
+ }
606
+ ]
607
+ }
608
+ ],
609
+ "temperature": 0.1,
610
+ "max_tokens": 2000
611
+ },
612
+ timeout=120
613
+ )
614
+
615
+ if response.status_code == 200:
616
+ result = response.json()
617
+ markdown_table = result['choices'][0]['message']['content']
618
+
619
+ # Strip code fences
620
+ markdown_table = re.sub(r'```markdown\s*|\s*```', '', markdown_table)
621
+ markdown_table = re.sub(r'```\s*|\s*```', '', markdown_table)
622
+
623
+ return markdown_table.strip()
624
+ else:
625
+ # Print error details
626
+ error_detail = response.text
627
+ print(f"OpenAI API 오류: {response.status_code}")
628
+ print(f"상세: {error_detail}")
629
+ return f"[표 변환 실패: {response.status_code} - {error_detail[:200]}]"
630
+
631
+ except Exception as e:
632
+ return f"[ํ‘œ ๋ณ€ํ™˜ ์‹คํŒจ: {str(e)}]"
633
+
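A hedged sketch of how the two helpers compose: detect table bboxes with pdfplumber, then hand each one to the Vision conversion. The file path and page index are illustrative, and it assumes the functions above are in scope:

```python
import pdfplumber

with open("sample.pdf", "rb") as fh:  # hypothetical input file
    pdf_bytes = fh.read()

with pdfplumber.open("sample.pdf") as pdf:
    page = pdf.pages[0]
    tables = page.find_tables({"vertical_strategy": "lines",
                               "horizontal_strategy": "lines"})
    for table in tables:
        md = convert_table_to_markdown_with_vision(
            pdf_bytes, 0, table.bbox, OPENAI_API_KEY)
        print(md)
```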
634
  def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]:
635
+ """
636
+ Extract text and tables from the PDF (tables are handled by the OpenAI Vision API)
637
+ """
638
  pdf_bytes = pdf_file.read()
639
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
640
 
 
645
  CHUNK_SIZE = 800
646
  OVERLAP_SIZE = 150
647
 
648
+ # Open the PDF with pdfplumber
649
+ pdf_file.seek(0)
650
+
651
+ with pdfplumber.open(pdf_file) as pdf_plumber:
652
+ for page_num in range(len(doc)):
653
+ # Extract text with PyMuPDF
654
+ fitz_page = doc[page_num]
655
+ text = fitz_page.get_text("text")
656
 
657
+ # Detect tables with pdfplumber
658
+ tables_markdown = []
659
+ if page_num < len(pdf_plumber.pages):
660
+ plumber_page = pdf_plumber.pages[page_num]
661
 
662
+ # Table detection settings
663
+ table_settings = {
664
+ "vertical_strategy": "lines",
665
+ "horizontal_strategy": "lines",
666
+ "snap_tolerance": 3,
667
+ "join_tolerance": 3,
668
+ }
669
+
670
+ tables = plumber_page.find_tables(table_settings=table_settings)
671
+
672
+ # Process each table with the Vision API
673
+ for idx, table in enumerate(tables):
674
+ bbox = table.bbox # (x0, y0, x1, y1)
675
+
676
+ # Convert to Markdown via the OpenAI Vision API
677
+ markdown_table = convert_table_to_markdown_with_vision(
678
+ pdf_bytes,
679
+ page_num,
680
+ bbox,
681
+ OPENAI_API_KEY
682
+ )
683
+
684
+ tables_markdown.append(f"\n\n**[표 {idx + 1}]**\n{markdown_table}\n")
685
+
686
+ # Combine text and tables
687
+ combined_content = text
688
+ if tables_markdown:
689
+ combined_content += "\n\n" + "\n".join(tables_markdown)
690
+
691
+ pages_text[page_num + 1] = combined_content
692
+
693
+ if not combined_content.strip():
694
+ continue
695
+
696
+ # Split into chunks
697
+ lines = [line.strip() for line in combined_content.split('\n') if line.strip()]
698
+ cleaned_text = '\n'.join(lines)
699
+
700
+ # If table markers are present, split on them first
701
+ if "**[표" in cleaned_text:
702
+ # Split by table
703
+ table_pattern = r'\*\*\[표 \d+\]\*\*'
704
+ parts = re.split(f'({table_pattern})', cleaned_text)
705
+
706
+ current_chunk = ""
707
+ for part in parts:
708
+ part = part.strip()
709
+ if not part:
710
+ continue
711
+
712
+ # A table marker section
713
+ if re.match(table_pattern, part):
714
+ if current_chunk:
715
+ chunks.append(current_chunk.strip())
716
+ metadata_list.append({
717
+ "page": page_num + 1,
718
+ "source": pdf_file.name,
719
+ "chunk_type": "text"
720
+ })
721
+ current_chunk = ""
722
+ current_chunk = part
723
+ else:
724
+ # Table body or plain text
725
+ if current_chunk and re.match(table_pattern, current_chunk):
726
+ # The previous part was a table marker, so append the table body
727
+ current_chunk += "\n" + part
728
+ chunks.append(current_chunk.strip())
729
+ metadata_list.append({
730
+ "page": page_num + 1,
731
+ "source": pdf_file.name,
732
+ "chunk_type": "table"
733
+ })
734
+ current_chunk = ""
735
+ else:
736
+ # Plain text handling
737
+ if len(current_chunk) + len(part) > CHUNK_SIZE:
738
+ if current_chunk:
739
+ chunks.append(current_chunk.strip())
740
+ metadata_list.append({
741
+ "page": page_num + 1,
742
+ "source": pdf_file.name,
743
+ "chunk_type": "text"
744
+ })
745
+ current_chunk = part
746
+ else:
747
+ current_chunk += "\n" + part if current_chunk else part
748
+
749
+ if current_chunk:
750
+ chunk_type = "table" if re.match(table_pattern, current_chunk) else "text"
751
+ chunks.append(current_chunk.strip())
752
+ metadata_list.append({
753
+ "page": page_num + 1,
754
+ "source": pdf_file.name,
755
+ "chunk_type": chunk_type
756
+ })
757
  else:
758
+ # No tables: fall back to plain-text chunking
759
+ sentences = re.split(r'([.!?]\s+|\n{2,})', cleaned_text)
760
+ sentences = [s for s in sentences if s.strip()]
761
+
762
+ current_chunk = ""
763
+ current_length = 0
764
+
765
+ for sentence in sentences:
766
+ sentence_length = len(sentence)
767
+
768
+ if current_length + sentence_length > CHUNK_SIZE and current_chunk:
769
+ chunks.append(current_chunk.strip())
770
+ metadata_list.append({
771
+ "page": page_num + 1,
772
+ "source": pdf_file.name,
773
+ "chunk_type": "text"
774
+ })
775
+
776
+ overlap_text = current_chunk[-OVERLAP_SIZE:] if len(current_chunk) > OVERLAP_SIZE else current_chunk
777
+ current_chunk = overlap_text + sentence
778
+ current_length = len(current_chunk)
779
+ else:
780
+ current_chunk += sentence
781
+ current_length += sentence_length
782
+
783
+ if current_chunk.strip():
784
+ chunks.append(current_chunk.strip())
785
+ metadata_list.append({
786
+ "page": page_num + 1,
787
+ "source": pdf_file.name,
788
+ "chunk_type": "text"
789
+ })
790
 
791
  doc.close()
792
  return chunks, metadata_list, pdf_bytes, pages_text
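A tiny demo of the table-marker split used above, on made-up page text; the capturing group keeps the `**[표 N]**` markers in the output list:

```python
import re

cleaned_text = "본문 내용입니다.\n**[표 1]**\n| 항목 | 값 |\n| --- | --- |"
table_pattern = r'\*\*\[표 \d+\]\*\*'
print(re.split(f'({table_pattern})', cleaned_text))
# ['본문 내용입니다.\n', '**[표 1]**', '\n| 항목 | 값 |\n| --- | --- |']
```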
793
 
794
 
795
+ def save_extracted_text_to_file(chunks: List[str], metadata_list: List[Dict], filename: str):
796
+ """
797
+ ์ถ”์ถœํ•œ ํ…์ŠคํŠธ๋ฅผ ๋กœ์ปฌ ํŒŒ์ผ๋กœ ์ €์žฅ
798
+ """
799
+ import os
800
+ from datetime import datetime
801
+
802
+ # Create the output directory
803
+ output_dir = "extracted_text"
804
+ os.makedirs(output_dir, exist_ok=True)
805
+
806
+ # Build the output filename (with timestamp)
807
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
808
+ base_name = os.path.splitext(filename)[0]
809
+ output_file = os.path.join(output_dir, f"{base_name}_{timestamp}.txt")
810
+
811
+ # Write the text
812
+ with open(output_file, 'w', encoding='utf-8') as f:
813
+ f.write(f"=" * 80 + "\n")
814
+ f.write(f"๋ฌธ์„œ๋ช…: {filename}\n")
815
+ f.write(f"์ถ”์ถœ ์‹œ๊ฐ„: {timestamp}\n")
816
+ f.write(f"์ด ์ฒญํฌ ์ˆ˜: {len(chunks)}\n")
817
+ f.write(f"=" * 80 + "\n\n")
818
+
819
+ for idx, (chunk, meta) in enumerate(zip(chunks, metadata_list), 1):
820
+ f.write(f"\n{'='*80}\n")
821
+ f.write(f"์ฒญํฌ #{idx}\n")
822
+ f.write(f"ํŽ˜์ด์ง€: {meta.get('page', 'N/A')}\n")
823
+ f.write(f"ํƒ€์ž…: {meta.get('chunk_type', 'text')}\n")
824
+ f.write(f"{'-'*80}\n")
825
+ f.write(chunk)
826
+ f.write(f"\n{'='*80}\n")
827
+
828
+ return output_file
829
+
830
+ @st.cache_resource(show_spinner=False)
831
  def load_embedding_model():
832
  return SentenceTransformer(EMBEDDING_MODEL)
833
 
 
923
  vector_score = 1 - vector_results['distances'][0][i]
924
 
925
  keyword_score = 0
926
+
927
+ # Prepare both the raw and the normalized form
928
  doc_lower = doc.lower()
929
+ doc_norm = normalize_for_search(doc) # whitespace-stripped version
930
+
931
  for keyword in keywords:
932
+ kw_lower = keyword.lower()
933
+ kw_norm = normalize_for_search(keyword)
934
+
935
+ # 1) Original check: direct substring containment
936
+ # 2) Whitespace-stripped check: matches even when words are joined or oddly spaced
937
+ if kw_lower in doc_lower or kw_norm in doc_norm:
938
  keyword_score += 1
 
939
 
940
+ keyword_score = keyword_score / len(keywords) if keywords else 0
941
  hybrid_score = 0.7 * vector_score + 0.3 * keyword_score
942
 
943
  hybrid_results.append({
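On toy values, the 0.7/0.3 blend and the whitespace-stripped keyword check behave like this (the document line and the vector score below are invented):

```python
import re

def normalize_for_search(text: str) -> str:
    return re.sub(r'\s+', '', text.strip().lower())

doc = "낙찰자는 계약 체결일로부터 10일 이내에 착수하여야 한다"
keywords = ["계약체결일", "착수"]

vector_score = 0.82  # hypothetical: 1 - cosine distance from Chroma
hits = sum(1 for kw in keywords
           if kw.lower() in doc.lower()
           or normalize_for_search(kw) in normalize_for_search(doc))
keyword_score = hits / len(keywords) if keywords else 0
print(0.7 * vector_score + 0.3 * keyword_score)  # 0.874 – both keywords hit
```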
 
1019
  f"{GROK_API_BASE}/chat/completions",
1020
  headers=headers,
1021
  json=payload,
1022
+ timeout=120
1023
  )
1024
 
1025
  if response.status_code != 200:
 
1136
  f"{GROK_API_BASE}/chat/completions",
1137
  headers=headers,
1138
  json=payload,
1139
+ timeout=120
1140
  )
1141
 
1142
  if response.status_code != 200:
 
1158
 
1159
  def highlight_text_in_pdf(pdf_bytes: bytes, highlight_info: List[Dict]) -> bytes:
1160
  """
1161
+ PyMuPDF-based highlighting: try the full text first, split only on failure
1162
  """
1163
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
1164
  yellow_color = [1.0, 1.0, 0.0]
1165
 
1166
  def normalize_text(text):
1167
+ """ํ…์ŠคํŠธ ์ •๊ทœํ™”"""
1168
+ return re.sub(r'\s+', ' ', text.strip())
1169
 
1170
+ def merge_rects(rects, threshold=5):
1171
+ """๊ฒน์น˜๊ฑฐ๋‚˜ ์ธ์ ‘ํ•œ ์‚ฌ๊ฐํ˜•๋“ค์„ ๋ณ‘ํ•ฉ"""
1172
+ if not rects:
1173
+ return []
1174
 
1175
+ # Sort rectangles by y, then x
1176
+ sorted_rects = sorted(rects, key=lambda r: (r.y0, r.x0))
1177
+ merged = [sorted_rects[0]]
1178
 
1179
+ for rect in sorted_rects[1:]:
1180
+ last = merged[-1]
1181
+ # Same line and overlapping or adjacent on x: merge
1182
+ if abs(rect.y0 - last.y0) < threshold:
1183
+ if rect.x0 <= last.x1 + threshold:
1184
+ merged[-1] = fitz.Rect(
1185
+ min(last.x0, rect.x0),
1186
+ min(last.y0, rect.y0),
1187
+ max(last.x1, rect.x1),
1188
+ max(last.y1, rect.y1)
1189
+ )
1190
+ else:
1191
+ merged.append(rect)
1192
+ # Different line but vertically contiguous (a line break)
1193
+ elif rect.y0 <= last.y1 + 20:
1194
+ merged.append(rect)
1195
+ else:
1196
+ merged.append(rect)
1197
+
1198
+ return merged
1199
+
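A toy check of `merge_rects`, assuming it is lifted to module scope: two word boxes on the same baseline with a small gap collapse into one highlight rectangle:

```python
import fitz  # PyMuPDF

a = fitz.Rect(50, 100, 120, 112)
b = fitz.Rect(123, 100, 200, 112)  # 3pt gap, same baseline
print(merge_rects([a, b]))         # one merged Rect(50.0, 100.0, 200.0, 112.0)
```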
1200
+ def find_text_across_lines(page, search_text):
1201
+ """์ค„๋ฐ”๊ฟˆ์„ ๋„˜์–ด์„œ ํ…์ŠคํŠธ ์ฐพ๊ธฐ - ๊ณต๋ฐฑ ๋ฌด์‹œ ๋น„๊ต"""
1202
+ found_rects = []
1203
+
1204
+ # Get the page's text structure
1205
+ blocks = page.get_text("dict")["blocks"]
1206
+
1207
+ # Collect every line's text and bbox
1208
+ lines_info = [] # [(text, bbox), ...]
1209
 
 
 
1210
  for block in blocks:
1211
+ if "lines" not in block:
1212
  continue
1213
+ for line in block["lines"]:
1214
+ line_text = ""
1215
+ for span in line["spans"]:
1216
+ line_text += span["text"]
1217
+ if line_text.strip():
1218
+ lines_info.append((line_text, fitz.Rect(line["bbox"])))
 
1219
 
1220
+ if not lines_info:
 
 
1221
  return []
1222
 
1223
+ # Normalize the query: fully whitespace-stripped
1224
+ search_no_space = search_text.lower().replace(" ", "").replace("\n", "")
1225
 
1226
+ # Search over concatenations of consecutive lines
1227
+ for start_idx in range(len(lines_info)):
1228
+ combined_text = ""
1229
+ combined_bboxes = []
1230
+
1231
+ for end_idx in range(start_idx, min(start_idx + 5, len(lines_info))): # at most 5 lines
1232
+ line_text, line_bbox = lines_info[end_idx]
1233
+ combined_text += line_text
1234
+ combined_bboxes.append(line_bbox)
1235
 
1236
+ # Compare after stripping whitespace (the key step!)
1237
+ combined_no_space = combined_text.lower().replace(" ", "").replace("\n", "")
 
1238
 
1239
+ # Check whether the query is contained
1240
+ if search_no_space in combined_no_space:
1241
+ # Matched: return the bboxes of those lines
1242
+ for bbox in combined_bboxes:
1243
+ found_rects.append(bbox)
1244
+ print(f" โœ… ๋ผ์ธ ๋งค์นญ ({start_idx+1}~{end_idx+1}์ค„): {len(combined_bboxes)}๊ฐœ ์˜์—ญ")
1245
+ return merge_rects(found_rects)
1246
 
1247
  return []
1248
 
1249
+ def find_text_with_pymupdf(page, search_text):
1250
+ """PyMuPDF๋กœ ํ…์ŠคํŠธ ์ฐพ๊ธฐ - ์ •ํ™•ํ•˜๊ณ  ๊น”๋”ํ•˜๊ฒŒ"""
1251
+ found_rects = []
1252
+ search_text = search_text.strip()
1253
+
1254
+ print(f" ๊ฒ€์ƒ‰ ์ค‘...")
1255
+
1256
+ # === Priority 1: PyMuPDF built-in search ===
1257
+ instances = page.search_for(search_text)
1258
+ if instances:
1259
+ print(f" โœ… ์„ฑ๊ณต [์›๋ณธ]: {len(instances)}๊ฐœ")
1260
+ return merge_rects(instances)
1261
+
1262
+ # === Priority 2: search after normalization ===
1263
+ normalized = normalize_text(search_text)
1264
+ if normalized != search_text:
1265
+ instances = page.search_for(normalized)
1266
+ if instances:
1267
+ print(f" โœ… ์„ฑ๊ณต [์ •๊ทœํ™”]: {len(instances)}๊ฐœ")
1268
+ return merge_rects(instances)
1269
+
1270
+ # === Priority 3: search across line breaks (line matching) ===
1271
+ line_results = find_text_across_lines(page, search_text)
1272
+ if line_results:
1273
+ return line_results
1274
+
1275
+ print(f" โš ๏ธ ๋ผ์ธ ๋งค์นญ ์‹คํŒจ โ†’ ํ•ต์‹ฌ ๊ตฌ๋ฌธ")
1276
+
1277
+ # === Priority 4: search key phrases only (first 30 chars + last 20 chars) ===
1278
+ if len(search_text) > 50:
1279
+ # Leading part
1280
+ front = search_text[:30]
1281
+ front_inst = page.search_for(front)
1282
+ if front_inst:
1283
+ print(f" โœ… ์•ž๋ถ€๋ถ„ ๋งค์นญ: {front[:20]}...")
1284
+ found_rects.extend(front_inst[:1]) # ์ฒซ ๋ฒˆ์งธ๋งŒ
1285
+
1286
+ # Trailing part
1287
+ back = search_text[-20:]
1288
+ back_inst = page.search_for(back)
1289
+ if back_inst:
1290
+ print(f" โœ… ๋’ท๋ถ€๋ถ„ ๋งค์นญ: ...{back[:15]}")
1291
+ found_rects.extend(back_inst[:1]) # ์ฒซ ๋ฒˆ์งธ๋งŒ
1292
+
1293
+ if found_rects:
1294
+ return merge_rects(found_rects)
1295
+
1296
+ print(f" โš ๏ธ ํ•ต์‹ฌ ๊ตฌ๋ฌธ ์‹คํŒจ โ†’ ํ‚ค์›Œ๋“œ")
1297
+
1298
+ # === Priority 5: keywords (at most 2) ===
1299
+ keywords = re.findall(r'[가-힣]{10,}', search_text)
1300
+ if not keywords:
1301
+ keywords = re.findall(r'[가-힣]{7,}', search_text)
1302
+
1303
+ if keywords:
1304
+ for kw in keywords[:2]: # at most 2
1305
+ inst = page.search_for(kw)
1306
+ if inst:
1307
+ print(f" โœ… ํ‚ค์›Œ๋“œ: {kw}")
1308
+ found_rects.extend(inst[:1]) # ์ฒซ ๋ฒˆ์งธ๋งŒ
1309
+
1310
+ if found_rects:
1311
+ return merge_rects(found_rects)
1312
+
1313
+ # === Priority 6: blocks ===
1314
+ print(f" 최후: 블록")
1315
+ blocks = page.get_text("dict")["blocks"]
1316
+ search_norm = normalize_text(search_text.lower())
1317
+
1318
+ for block in blocks:
1319
+ if "lines" not in block:
1320
+ continue
1321
+
1322
+ block_text = ""
1323
+ for line in block["lines"]:
1324
+ for span in line["spans"]:
1325
+ block_text += span["text"] + " "
1326
+
1327
+ block_norm = normalize_text(block_text.lower())
1328
+
1329
+ if search_norm in block_norm:
1330
+ found_rects.append(fitz.Rect(block["bbox"]))
1331
+ print(f" โœ… ๋ธ”๋ก ์ผ์น˜")
1332
+ break
1333
+
1334
+ return merge_rects(found_rects) if found_rects else []
1335
+
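Hypothetical usage of the cascade above (it assumes `find_text_with_pymupdf` is reachable and a sample file exists):

```python
import fitz  # PyMuPDF

doc = fitz.open("sample.pdf")  # illustrative path
page = doc[0]
for rect in find_text_with_pymupdf(page, "계약 체결일로부터 10일 이내"):
    page.add_highlight_annot(rect)
doc.save("highlighted.pdf")
```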
1336
+ print(f"\n{'='*80}")
1337
+ print(f"ํ•˜์ด๋ผ์ดํŠธ ์‹œ์ž‘ - ์ด {len(highlight_info)}๊ฐœ ํ•ญ๋ชฉ")
1338
+ print(f"{'='*80}")
1339
+
1340
+ total_success = 0
1341
+ total_failed = 0
1342
+
1343
+ for idx, item in enumerate(highlight_info, 1):
1344
  page_num = item['page'] - 1
1345
+ text_to_highlight = item['text'].strip()
1346
 
1347
  if page_num >= len(doc):
1348
+ print(f"\n[{idx}] โŒ ํŽ˜์ด์ง€ ์˜ค๋ฅ˜: {page_num + 1}")
1349
+ total_failed += 1
1350
  continue
1351
 
1352
  page = doc[page_num]
1353
 
1354
+ print(f"\n[{idx}/{len(highlight_info)}]")
1355
+ print(f" ๐Ÿ“„ ํŽ˜์ด์ง€: {page_num + 1}")
1356
+ print(f" ๐Ÿ“ ๊ธธ์ด: {len(text_to_highlight)}์ž")
1357
+ print(f" ๐Ÿ’ฌ ๋‚ด์šฉ: {text_to_highlight[:70]}...")
1358
 
1359
+ # Find the text
1360
+ found_rects = find_text_with_pymupdf(page, text_to_highlight)
1361
 
1362
+ # Deduplicate rectangles at the same position
1363
+ unique_rects = []
1364
+ for rect in found_rects:
1365
+ is_duplicate = False
1366
+ for existing in unique_rects:
1367
+ # Nearly identical coordinates count as duplicates
1368
+ if (abs(rect.x0 - existing.x0) < 5 and
1369
+ abs(rect.y0 - existing.y0) < 5 and
1370
+ abs(rect.x1 - existing.x1) < 5 and
1371
+ abs(rect.y1 - existing.y1) < 5):
1372
+ is_duplicate = True
1373
+ break
1374
+ if not is_duplicate:
1375
+ unique_rects.append(rect)
1376
 
1377
+ # Add the highlight annotations
1378
+ highlighted_count = 0
1379
+ for rect in unique_rects:
1380
+ try:
1381
  highlight = page.add_highlight_annot(rect)
1382
  highlight.set_colors(stroke=yellow_color)
1383
  highlight.update()
1384
+ highlighted_count += 1
1385
+ except Exception as e:
1386
+ print(f" โœ— ํ•˜์ด๋ผ์ดํŠธ ์‹คํŒจ: {e}")
1387
 
1388
+ if highlighted_count > 0:
1389
+ print(f" โœ… ์™„๋ฃŒ: {highlighted_count}๊ฐœ ์˜์—ญ")
1390
+ total_success += 1
1391
+ else:
1392
+ print(f" โŒ ์‹คํŒจ: ํ…์ŠคํŠธ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Œ")
1393
+ total_failed += 1
1394
+
1395
+ print(f"\n{'='*80}")
1396
+ print(f"๐Ÿ“Š ์ตœ์ข… ๊ฒฐ๊ณผ: โœ… ์„ฑ๊ณต {total_success}๊ฐœ / โŒ ์‹คํŒจ {total_failed}๊ฐœ")
1397
+ print(f"{'='*80}\n")
1398
 
1399
  output_bytes = doc.tobytes()
1400
  doc.close()
1401
  return output_bytes
1402
 
 
1403
  def extract_highlights_from_grok(grok_result: Dict) -> List[Dict]:
1404
  if "error" in grok_result:
1405
  return []
 
1417
  return highlights
1418
 
1419
 
1420
+
1421
  def extract_highlights_from_answer(answer: str) -> List[Dict]:
1422
  """
1423
+ Extract the text to highlight from the answer
1424
+ Text both before and after a [페이지 X] tag is attributed to that page
1425
  """
 
 
1426
  highlights = []
1427
 
1428
+ print(f"\n{'='*80}")
1429
+ print(f"๋‹ต๋ณ€ ํ…์ŠคํŠธ ๋ถ„์„ ์ค‘...")
1430
+ print(f"{'='*80}\n")
 
1431
 
1432
+ # Find [페이지 X] patterns
1433
+ page_pattern = r'\[\s*페이지\s*(\d+)\s*\]'
1434
+ page_matches = list(re.finditer(page_pattern, answer))
1435
+
1436
+ print(f"๐Ÿ“ [ํŽ˜์ด์ง€] ํƒœ๊ทธ {len(page_matches)}๊ฐœ ๋ฐœ๊ฒฌ\n")
1437
+
1438
+ quoted_matches = []
1439
+ list_matches = []
1440
+
1441
+ # Analyze the sections before and after each [페이지 X] tag
1442
+ for i, match in enumerate(page_matches):
1443
+ page_num = match.group(1)
1444
+ tag_start = match.start()
1445
+ tag_end = match.end()
1446
+
1447
+ # === Section 1: text before [페이지 X] (within the same paragraph) ===
1449
+ # Back to the previous [페이지] tag, or to a double line break
1449
+ section_start = 0
1450
+ if i > 0:
1451
+ section_start = page_matches[i-1].end()
1452
+
1453
+ # Same paragraph before [페이지 X] (up to a double line break)
1454
+ before_section = answer[section_start:tag_start]
1455
+
1456
+ # Find the last bullet point or quotation
1457
+ last_para_match = re.search(r'([-*○]\s+.+)$', before_section, re.DOTALL)
1458
+ if last_para_match:
1459
+ before_text = last_para_match.group(1)
1460
+ print(f"--- ํŽ˜์ด์ง€ {page_num} ์•ž๋ถ€๋ถ„ (๊ธธ์ด: {len(before_text)}์ž) ---")
1461
+ print(f"{before_text[:150]}...\n")
1462
+
1463
+ # Extract double-quoted citations
1464
+ quotes = re.findall(r'"([^"]+)"', before_text)
1465
+ for quote in quotes:
1466
+ quote_clean = quote.strip()
1467
+ if len(quote_clean) > 10:
1468
+ quoted_matches.append((quote_clean, int(page_num)))
1469
+ print(f" โœ“ [์•ž-์ธ์šฉ๋ฌธ] \"{quote_clean[:60]}...\"")
1470
+
1471
+ # === Section 2: text after [페이지 X] (original logic) ===
1472
+ next_page_pos = len(answer)
1473
+ if i + 1 < len(page_matches):
1474
+ next_page_pos = page_matches[i + 1].start()
1475
+
1476
+ section = answer[tag_end:next_page_pos]
1477
+ print(f"--- ํŽ˜์ด์ง€ {page_num} ๋’ท๋ถ€๋ถ„ (๊ธธ์ด: {len(section)}์ž) ---")
1478
+ print(f"{section[:150]}...\n")
1479
+
1480
+ # Double-quoted citations
1481
+ quotes = re.findall(r'"([^"]+)"', section)
1482
+ for quote in quotes:
1483
+ quote_clean = quote.strip()
1484
+ if len(quote_clean) > 10:
1485
+ quoted_matches.append((quote_clean, int(page_num)))
1486
+ print(f" โœ“ [๋’ค-์ธ์šฉ๋ฌธ] \"{quote_clean[:60]}...\"")
1487
+
1488
+ # List items
1489
+ lines = section.split('\n')
1490
+ for line in lines:
1491
+ line_stripped = line.strip()
1492
+
1493
+ if len(line_stripped) < 3:
1494
+ continue
1495
+
1496
+ if line_stripped.startswith('**') or line_stripped.startswith('#'):
1497
+ continue
1498
+
1499
+ item = None
1500
+
1501
+ if line_stripped.startswith('○'):
1502
+ item = line_stripped[1:].strip()
1503
+ elif line_stripped.startswith('- ') or line_stripped.startswith('* '):
1504
+ item = line_stripped[2:].strip()
1505
+ elif re.match(r'^\d+\.\s+', line_stripped):
1506
+ match_obj = re.match(r'^\d+\.\s+(.+)$', line_stripped)
1507
+ if match_obj:
1508
+ item = match_obj.group(1).strip()
1509
+
1510
+ if item:
1511
+ item = re.sub(r'\[\s*페이지\s*\d+\s*\]', '', item).strip()
1512
+ item = re.sub(r'\*\*([^*]+)\*\*', r'\1', item).strip()
1513
+ item = re.sub(r'\([""""][^)]+[""""\)]+', '', item).strip()
1514
+ item = re.sub(r'\s*\([^)]{0,50}\)\s*$', '', item).strip()
1515
+
1516
+ if 3 <= len(item) <= 200:
1517
+ list_matches.append((item, int(page_num)))
1518
+ print(f" โœ“ [๋ฆฌ์ŠคํŠธ] {item[:50]}...")
1519
+
1520
+ print(f"\n{'='*40}")
1521
+ print(f"๐Ÿ“ ์ธ์šฉ๋ฌธ: {len(quoted_matches)}๊ฐœ")
1522
+ print(f"๐Ÿ“‹ ๋ฆฌ์ŠคํŠธ: {len(list_matches)}๊ฐœ")
1523
+ print(f"{'='*40}\n")
1524
+
1525
+ # Priority rules
1526
+ all_matches = []
1527
+
1528
+ if quoted_matches and list_matches:
1529
+ all_short = all(len(q[0]) <= 30 for q in quoted_matches)
1530
+ if all_short:
1531
+ print(f"โœ“ ์งง์€ ์ธ์šฉ๋ฌธ + ๋ฆฌ์ŠคํŠธ ๋ชจ๋‘")
1532
+ all_matches = quoted_matches + list_matches
1533
+ else:
1534
+ print(f"โœ“ ์ธ์šฉ๋ฌธ๋งŒ")
1535
+ all_matches = quoted_matches
1536
+ elif quoted_matches:
1537
+ print(f"โœ“ ์ธ์šฉ๋ฌธ๋งŒ")
1538
+ all_matches = quoted_matches
1539
+ elif list_matches:
1540
+ print(f"โœ“ ๋ฆฌ์ŠคํŠธ๋งŒ")
1541
+ all_matches = list_matches
1542
+
1543
+ # Deduplicate
1544
+ seen = set()
1545
+ for text, page in all_matches:
1546
+ if text and (text, page) not in seen:
1547
+ highlights.append({
1548
+ 'text': text,
1549
+ 'page': page
1550
+ })
1551
+ seen.add((text, page))
1552
+
1553
+ print(f"\n{'='*80}")
1554
+ print(f"โœ… ์ตœ์ข… ์ถ”์ถœ: {len(highlights)}๊ฐœ")
1555
+ for i, h in enumerate(highlights, 1):
1556
+ print(f" [{i}] ํŽ˜์ด์ง€ {h['page']}: {h['text'][:60]}...")
1557
+ print(f"{'='*80}\n")
1558
 
1559
  return highlights
1560
 
1561
 
1562
+
1563
  def render_pdf_with_highlights(pdf_bytes: bytes, highlight_info: List[Dict], zoom_level: float = 2.0):
1564
  highlighted_pdf = highlight_text_in_pdf(pdf_bytes, highlight_info)
1565
  doc = fitz.open(stream=highlighted_pdf, filetype="pdf")
 
1580
  if (page_num + 1) in highlighted_pages:
1581
pdf_html += f'<div style="background: #FEF08A; color: #854D0E; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold; border-left: 4px solid #EAB308;">⭐ 페이지 {page_num + 1}</div>'
1582
  else:
1583
+ pdf_html += f'<div style="background: #ADADAD; color: white; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold;"> 페이지 {page_num + 1}</div>'
1584
 
1585
  pdf_html += f'<img src="data:image/png;base64,{img_base64}" style="width: {zoom_percentage}%; border: 1px solid #E2E8F0; border-radius: 0.3rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); display: block; margin: 0 auto;" />'
1586
  pdf_html += '</div>'
 
1596
  if not st.session_state.processed:
1597
  col1, col2, col3 = st.columns([1, 1, 1])
1598
  with col2:
1599
+ st.markdown("<div style='height: 30vh;'></div>", unsafe_allow_html=True)
1600
+ st.image("img/plobin-grey.png", use_container_width=True)
1601
  st.text(' ')
1602
 
1603
  with st.sidebar:
1604
+ st.image("img/plobin-right-only.png", width=85)
1605
 
1606
  uploaded_file = st.file_uploader(
1607
  "๋“œ๋ž˜๊ทธํ•˜์—ฌ ํŒŒ์ผ์„ ์—…๋กœ๋“œ ๋˜๋Š” ํด๋ฆญํ•˜์—ฌ ์„ ํƒํ•˜์„ธ์š”.",
 
1612
 
1613
  if uploaded_file:
1614
if st.button("문서 처리 시작", type="primary", use_container_width=True):
1615
+ if not GROK_API_KEY or not OPENAI_API_KEY:
1616
+ st.error("⚠️ GROK_API_KEY 또는 OPENAI_API_KEY가 .env 파일에 설정되지 않았습니다!")
1617
  st.stop()
1618
 
1619
  st.session_state.vector_db = None
 
1621
  st.session_state.chat_history = []
1622
  st.session_state.current_highlights = []
1623
 
1624
+ with st.spinner("문서 분석을 시작합니다..."):
1625
  try:
1626
  chunks, metadata_list, pdf_bytes, pages_text = extract_text_from_pdf(uploaded_file)
1627
 
1628
+ with st.spinner("핵심 내용을 파악하고 있습니다..."):
1629
  collection, embedder = create_vector_db(chunks, metadata_list)
1630
 
1631
  st.session_state.vector_db = collection
 
1639
  "pages": len(set(m['page'] for m in metadata_list))
1640
  }
1641
 
1642
+ # Save the extracted text locally
1643
+ saved_file = save_extracted_text_to_file(
1644
+ chunks,
1645
+ metadata_list,
1646
+ uploaded_file.name
1647
+ )
1648
+
1649
+ st.success("문서 처리 완료!")
1650
  st.rerun()
1651
 
1652
  except Exception as e:
 
1657
  st.info(f"**{st.session_state.doc_metadata['filename']}**")
1658
  st.info(f"ํŽ˜์ด์ง€: {st.session_state.doc_metadata['pages']}")
1659
 
1660
+ # if not st.session_state.processed:
1661
+ # st.markdown("""
1662
+ # <div class="usage-guide">
1663
+ # <h2 style="text-align: center; color: #2D3748; margin-bottom: 1.5rem;">사용 방법</h2>
1664
+ # <div class="guide-step">
1665
+ # <div class="step-number">1</div>
1666
+ # <div>PDF 파일을 올려주세요</div>
1667
+ # </div>
1668
+ # <div class="guide-step">
1669
+ # <div class="step-number">2</div>
1670
+ # <div>문서 처리가 완료될 때까지 잠시만 기다려주세요</div>
1671
+ # </div>
1672
+ # <div class="guide-step">
1673
+ # <div class="step-number">3</div>
1674
+ # <div>문서 내 궁금한 내용을 물어보세요</div>
1675
+ # </div>
1676
+ # <div class="guide-step">
1677
+ # <div class="step-number">4</div>
1678
+ # <div>AI가 정확한 답변과 출처를 함께 알려드려요</div>
1679
+ # </div>
1680
+ # </div>
1681
+ # """, unsafe_allow_html=True)
1682
+
1683
+ if st.session_state.processed:
1684
  col1, col2 = st.columns([1, 1])
1685
 
1686
  with col1:
1687
  header_cols = st.columns([7, 1, 1.5, 1])
1688
  with header_cols[0]:
1689
+ st.markdown("### ")
1690
 
1691
  if st.session_state.pdf_bytes:
1692
  pdf_html = render_pdf_with_highlights(
 
1719
  st.session_state.scroll_to_page = None
1720
 
1721
  with col2:
1722
+ st.markdown('### ', unsafe_allow_html=True)
1723
 
1724
  chat_container = st.container(height=650)
1725
 
 
1727
  for msg_idx, msg in enumerate(st.session_state.chat_history):
1728
  with st.chat_message(msg["role"]):
1729
  st.markdown(msg["content"])
 
1730
 
1731
+ prompt = st.chat_input("질문을 입력하세요...", key="chat_input")
1732
 
1733
  if prompt:
1734
  st.session_state.chat_history.append({"role": "user", "content": prompt})
1735
  st.session_state.processing_query = prompt
1736
  st.rerun()
1737
 
1738
+ # Query handling inside main()
1739
  if st.session_state.processing_query:
1740
  query = st.session_state.processing_query
1741
  st.session_state.processing_query = None
1742
 
1743
+ with st.spinner("PLOBIN이 최적의 답변을 찾고 있습니다..."):
1744
  try:
1745
  search_results = hybrid_search(
1746
  query,
 
1761
  GROK_API_KEY
1762
  )
1763
 
1764
+ # ⭐ Important: extract only the text inside double quotes
1765
+ print("\n" + "="*80)
1766
+ print("답변에서 인용문 추출 중...")
1767
+ print("="*80)
1768
  highlights = extract_highlights_from_answer(answer)
1769
+
1770
+ # Highlights extracted from grok_result are not used (uncomment if needed)
1771
+ # grok_highlights = extract_highlights_from_grok(grok_result)
1772
+ # highlights.extend(grok_highlights)
1773
+
1774
  st.session_state.current_highlights = highlights
1775
 
1776
  if grok_result and "page" in grok_result and "error" not in grok_result:
 
1778
 
1779
  chat_data = {
1780
  "role": "assistant",
1781
+ "content": answer
 
1782
  }
1783
  st.session_state.chat_history.append(chat_data)
1784
  st.rerun()
 
1793
 
1794
 
1795
  if __name__ == "__main__":
1796
+ main()