# app.py
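"""Flask app that relays a multi-turn conversation between two OpenAI-compatible
chat endpoints: the user connects each endpoint with its own system prompt, then
/chat streams the models' alternating replies to the browser as newline-delimited
JSON, together with naive token-count and tokens-per-second stats."""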
from flask import Flask, render_template, request, jsonify, Response, stream_with_context
import requests
import json
from datetime import datetime
import time

app = Flask(__name__)

# Global variables to store the endpoints and system prompts
ENDPOINT1 = ""
ENDPOINT2 = ""
SYSTEM_PROMPT1 = ""
SYSTEM_PROMPT2 = ""
MODEL1 = ""
MODEL2 = ""

def get_model_name(endpoint):
    """Ask the endpoint for its model list and return the first model id."""
    try:
        response = requests.get(f"{endpoint}/v1/models")
        response.raise_for_status()
        models = response.json().get('data', [])
        if models:
            return models[0].get('id', 'Unknown Model')
        return 'Unknown Model'
    except Exception:  # Network errors and malformed JSON both fall back
        return 'Unknown Model'
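
# For reference, an OpenAI-compatible /v1/models response is assumed to look
# roughly like this (exact fields vary by server):
#   {"object": "list", "data": [{"id": "my-model", "object": "model"}]}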

def stream_response(endpoint, prompt, conversation_history):
    headers = {
        "Content-Type": "application/json"
    }
    data = {
        "model": "gpt-3.5-turbo",
        "messages": conversation_history + [{"role": "user", "content": prompt}],
        "stream": True  # We need to stream the response
    }
    try:
        response = requests.post(f"{endpoint}/v1/chat/completions", headers=headers, json=data, stream=True)
        response.raise_for_status()
        total_tokens = 0  # Track number of tokens
        start_time = time.time()  # Time tracking
        for line in response.iter_lines():
            if line:
                line = line.decode('utf-8')
                if line.startswith('data: '):
                    line = line[6:]  # Remove 'data: ' prefix
                    if line != '[DONE]':
                        try:
                            chunk = json.loads(line)
                            if 'choices' in chunk:
                                # Count tokens generated in this chunk
                                content = chunk['choices'][0]['delta'].get('content', '')
                                token_count = len(content.split())  # Naive token estimate by word count
                                total_tokens += token_count
                                yield chunk  # Yield valid chunks to the caller
                        except json.JSONDecodeError:
                            print(f"Invalid JSON: {line}")
        elapsed_time = time.time() - start_time  # Time taken to generate the tokens
        if elapsed_time > 0:  # Avoid division by zero
            tokens_per_second = total_tokens / elapsed_time
        else:
            tokens_per_second = 0
        # Yield a summary with token count and tokens-per-second rate
        yield {"total_tokens": total_tokens, "elapsed_time": elapsed_time, "tps": tokens_per_second}
    except requests.RequestException as e:
        # Yield a dict, not a JSON string, so callers can treat it like any other chunk
        yield {"error": str(e)}
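
# For reference, each streamed SSE line from an OpenAI-style endpoint is
# assumed to look like:
#   data: {"choices": [{"delta": {"content": "Hello"}}]}
# and the stream terminates with:
#   data: [DONE]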

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/connect', methods=['POST'])
def connect():
    global ENDPOINT1, ENDPOINT2, SYSTEM_PROMPT1, SYSTEM_PROMPT2, MODEL1, MODEL2
    endpoint_num = request.json['endpoint_num']
    endpoint_url = request.json['endpoint_url']
    system_prompt = request.json['system_prompt']
    if endpoint_num == 1:
        ENDPOINT1 = endpoint_url
        SYSTEM_PROMPT1 = system_prompt
        MODEL1 = get_model_name(ENDPOINT1)
    else:
        ENDPOINT2 = endpoint_url
        SYSTEM_PROMPT2 = system_prompt
        MODEL2 = get_model_name(ENDPOINT2)
    # Probe the endpoint root to verify it is reachable
    try:
        response = requests.get(endpoint_url)
        if response.status_code == 200:
            return jsonify({"status": "success", "message": f"Connected to Endpoint {endpoint_num}"})
        else:
            return jsonify({"status": "error", "message": f"Failed to connect to Endpoint {endpoint_num}"})
    except requests.RequestException:
        return jsonify({"status": "error", "message": f"Failed to connect to Endpoint {endpoint_num}"})

@app.route('/chat', methods=['POST'])
def chat():
    if not ENDPOINT1 or not ENDPOINT2:
        return jsonify({"error": "Please connect to both endpoints first"})
    initial_prompt = request.json['prompt']
    num_exchanges = int(request.json['num_exchanges'])
    conversation_history = [
        {"role": "system", "content": SYSTEM_PROMPT1},
        {"role": "user", "content": initial_prompt}
    ]

    def generate():
        current_prompt = initial_prompt
        for i in range(num_exchanges):
            # Stream response from Model 2
            yield json.dumps({"sender": "Model 2", "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "model": MODEL2}) + '\n'
            full_response = ""
            tokens, tps = 0, 0  # Track token stats from Model 2 (defaults in case the stream errors out)
            for chunk in stream_response(ENDPOINT2, current_prompt, conversation_history):
                if 'choices' in chunk:
                    content = chunk['choices'][0]['delta'].get('content', '')
                    full_response += content
                    yield json.dumps({"content": content}) + '\n'
                if 'total_tokens' in chunk:  # Pass token stats data
                    tokens = chunk['total_tokens']
                    tps = chunk['tps']
                    elapsed_time = chunk['elapsed_time']
            # Append model stats (timestamp, tokens, t/s)
            yield json.dumps({
                "end": True,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "model": MODEL2,
                "total_tokens": tokens,
                "tps": tps
            }) + '\n'
            current_prompt = full_response  # Prepare next prompt for Model 1
            conversation_history.append({"role": "assistant", "content": full_response})

            # Stream response from Model 1
            yield json.dumps({"sender": "Model 1", "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "model": MODEL1}) + '\n'
            full_response = ""
            tokens, tps = 0, 0  # Track token stats from Model 1
            for chunk in stream_response(ENDPOINT1, current_prompt, conversation_history):
                if 'choices' in chunk:
                    content = chunk['choices'][0]['delta'].get('content', '')
                    full_response += content
                    yield json.dumps({"content": content}) + '\n'
                if 'total_tokens' in chunk:
                    tokens = chunk['total_tokens']
                    tps = chunk['tps']
                    elapsed_time = chunk['elapsed_time']
            # Append model stats (timestamp, tokens, t/s)
            yield json.dumps({
                "end": True,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "model": MODEL1,
                "total_tokens": tokens,
                "tps": tps
            }) + '\n'
            current_prompt = full_response  # Prepare next prompt for Model 2
            conversation_history.append({"role": "assistant", "content": full_response})

    return Response(stream_with_context(generate()), content_type='application/json')
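
# A minimal client sketch for consuming the newline-delimited JSON stream
# (assumes the app is running locally on Flask's default port):
#   import requests, json
#   with requests.post("http://127.0.0.1:5000/chat",
#                      json={"prompt": "Hello", "num_exchanges": 2}, stream=True) as r:
#       for line in r.iter_lines():
#           if line:
#               print(json.loads(line))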

if __name__ == '__main__':
    app.run(debug=True)