ar08 commited on
Commit
2a14ed0
·
verified ·
1 Parent(s): 2716805

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -34
app.py CHANGED
@@ -1,57 +1,43 @@
1
- # pip install gradio transformers onnxruntime optimum torch
2
 
3
  import gradio as gr
4
- import torch
5
  from transformers import AutoTokenizer
6
- from optimum.onnxruntime import ORTModelForSeq2SeqLM, ORTOptimizer, ORTQuantizer
7
- from optimum.onnxruntime.configuration import AutoOptimizationConfig
8
  import onnxruntime as ort
 
9
 
10
- # Step 1: Load & optimize the ONNX model
11
- model_name = "Rahmat82/t5-small-finetuned-summarization-xsum"
12
- model = ORTModelForSeq2SeqLM.from_pretrained(model_name, export=True)
13
-
14
- optimizer = ORTOptimizer.from_pretrained(model)
15
- opt_config = AutoOptimizationConfig.O2() # graph fusions and transformer-specific optimizations
16
- optimizer.optimize(save_dir="optimized_model", optimization_config=opt_config)
17
- optimized_model = ORTModelForSeq2SeqLM.from_pretrained("optimized_model")
18
-
19
- # Step 2: Apply dynamic INT8 quantization for CPU
20
- quantizer = ORTQuantizer.from_pretrained(optimized_model)
21
- opt_q = quantizer.quantize(
22
- save_dir="quantized_model",
23
- quantization_config=AutoOptimizationConfig.O2().quantization_config, # dynamic quant
24
- )
25
- model = ORTModelForSeq2SeqLM.from_pretrained("quantized_model")
26
-
27
- # Step 3: Set up ONNXRuntime Session options for CPU multi-threading
28
  sess_options = ort.SessionOptions()
29
- sess_options.intra_op_num_threads = min(4, torch.get_num_threads()) # 4 threads for inference
30
  sess_options.inter_op_num_threads = 1
31
  sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
32
 
33
- # Rebuild pipeline with optimized quantized model on CPU
 
 
34
  tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
35
- summarizer = gradio_pipeline = None
36
- from optimum.pipelines import pipeline
37
  summarizer = pipeline(
38
- task="summarization",
39
  model=model,
40
  tokenizer=tokenizer,
41
- framework="pt",
42
- ort_session_options=sess_options,
43
- device=-1,
44
  batch_size=8,
45
  )
46
 
 
47
  def summarize_text(text):
48
  text = text.strip()
49
  if not text:
50
  return "Please enter some text."
 
51
  inputs = tokenizer.encode(text, max_length=1024, truncation=True, return_tensors="pt")
52
- decoded = tokenizer.decode(inputs[0], skip_special_tokens=True)
 
53
  summary = summarizer(
54
- decoded,
55
  min_length=60,
56
  max_length=120,
57
  do_sample=False
@@ -63,8 +49,8 @@ app = gr.Interface(
63
  fn=summarize_text,
64
  inputs=gr.Textbox(lines=12, label="Input Text"),
65
  outputs=gr.Textbox(label="Summary"),
66
- title="⚙️ CPU-Optimized ONNX T5 Summarizer",
67
- description="Uses graph optimizations, INT8 quantization, and threading tweaks for fast CPU performance."
68
  )
69
 
70
  if __name__ == "__main__":
 
1
+ # pip install gradio transformers optimum onnxruntime onnx
2
 
3
  import gradio as gr
 
4
  from transformers import AutoTokenizer
5
+ from optimum.onnxruntime import ORTModelForSeq2SeqLM
6
+ from optimum.pipelines import pipeline
7
  import onnxruntime as ort
8
+ import torch
9
 
10
+ # CPU optimization settings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
# ONNX Runtime session options tuned for CPU-only inference.
sess_options = ort.SessionOptions()
# Keep operator scheduling sequential; parallelism is applied inside ops instead.
sess_options.inter_op_num_threads = 1
# Cap intra-op parallelism at 4 threads (or fewer, if torch reports fewer).
sess_options.intra_op_num_threads = min(4, torch.get_num_threads())
# Turn on every graph-level optimization (node fusion, constant folding, ...).
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
15
 
16
# Checkpoint on the Hugging Face Hub: T5-small fine-tuned for XSum summarization.
model_name = "Rahmat82/t5-small-finetuned-summarization-xsum"

# Fast (Rust-backed) tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Seq2seq model served through ONNX Runtime, using the CPU session options above.
# NOTE(review): no `export=True` here — this assumes the repo already ships ONNX
# weights; confirm, or loading will fail for a PyTorch-only checkpoint.
model = ORTModelForSeq2SeqLM.from_pretrained(model_name, session_options=sess_options)
20
+
21
# Summarization pipeline bound to the ONNX model and its tokenizer.
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=-1,     # -1 pins execution to CPU
    batch_size=8,  # micro-batch size for throughput when several texts arrive
)
29
 
30
+ # Summarization function
31
  def summarize_text(text):
32
  text = text.strip()
33
  if not text:
34
  return "Please enter some text."
35
+
36
  inputs = tokenizer.encode(text, max_length=1024, truncation=True, return_tensors="pt")
37
+ input_text = tokenizer.decode(inputs[0], skip_special_tokens=True)
38
+
39
  summary = summarizer(
40
+ input_text,
41
  min_length=60,
42
  max_length=120,
43
  do_sample=False
 
49
  fn=summarize_text,
50
  inputs=gr.Textbox(lines=12, label="Input Text"),
51
  outputs=gr.Textbox(label="Summary"),
52
+ title="⚙️ ONNX T5 Summarizer (CPU-Optimized)",
53
+ description="Fast and optimized ONNX model for summarization on CPU. No quantization warnings or deprecated cache used."
54
  )
55
 
56
  if __name__ == "__main__":