"""Credit to https://github.com/THUDM/ChatGLM2-6B/blob/main/web_demo.py while mistakes are mine """ # pylint: disable=broad-exception-caught, redefined-outer-name, missing-function-docstring, missing-module-docstring, too-many-arguments, line-too-long, invalid-name, redefined-builtin, redefined-argument-from-local # import gradio as gr # model_name = "models/THUDM/chatglm2-6b-int4" # gr.load(model_name).lauch() # %%writefile demo-4bit.py import os import time from textwrap import dedent import gradio as gr import mdtex2html import torch from loguru import logger from transformers import AutoModel, AutoTokenizer # fix timezone in Linux os.environ["TZ"] = "Asia/Shanghai" try: time.tzset() # type: ignore # pylint: disable=no-member except Exception: # Windows logger.warning("Windows, cant run time.tzset()") model_name = "THUDM/chatglm2-6b" # model_name = "THUDM/chatglm2-6b-int4" RETRY_FLAG = False tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) # model = AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda() # 4/8 bit # model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).quantize(4).cuda() has_cuda = torch.cuda.is_available() # has_cuda = False # force cpu if has_cuda: model = ( AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda().half() ) # 3.92G else: model = AutoModel.from_pretrained( model_name, trust_remote_code=True ).half() # .float() .half().float() model = model.eval() _ = """Override Chatbot.postprocess""" def postprocess(self, y): if y is None: return [] for i, (message, response) in enumerate(y): y[i] = ( None if message is None else mdtex2html.convert((message)), None if response is None else mdtex2html.convert(response), ) return y gr.Chatbot.postprocess = postprocess def parse_text(text): """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/""" lines = text.split("\n") lines = [line for line in lines if line != ""] count = 0 for i, line in enumerate(lines): if "```" in line: count += 1 items = line.split("`") if count % 2 == 1: lines[i] = f'
'
            else:
                lines[i] = "
" else: if i > 0: if count % 2 == 1: line = line.replace("`", r"\`") line = line.replace("<", "<") line = line.replace(">", ">") line = line.replace(" ", " ") line = line.replace("*", "*") line = line.replace("_", "_") line = line.replace("-", "-") line = line.replace(".", ".") line = line.replace("!", "!") line = line.replace("(", "(") line = line.replace(")", ")") line = line.replace("$", "$") lines[i] = "
" + line text = "".join(lines) return text def predict( RETRY_FLAG, input, chatbot, max_length, top_p, temperature, history, past_key_values ): try: chatbot.append((parse_text(input), "")) except Exception as exc: logger.error(exc) logger.debug(f"{chatbot=}") _ = """ if chatbot: chatbot[-1] = (parse_text(input), str(exc)) yield chatbot, history, past_key_values # """ yield chatbot, history, past_key_values for response, history, past_key_values in model.stream_chat( tokenizer, input, history, past_key_values=past_key_values, return_past_key_values=True, max_length=max_length, top_p=top_p, temperature=temperature, ): chatbot[-1] = (parse_text(input), parse_text(response)) yield chatbot, history, past_key_values def trans_api(input, max_length=4096, top_p=0.8, temperature=0.2): if max_length < 10: max_length = 4096 if top_p < 0.1 or top_p > 1: top_p = 0.85 if temperature <= 0 or temperature > 1: temperature = 0.01 try: res, _ = model.chat( tokenizer, input, history=[], past_key_values=None, max_length=max_length, top_p=top_p, temperature=temperature, ) # logger.debug(f"{res=} \n{_=}") except Exception as exc: logger.error(f"{exc=}") res = str(exc) return res def reset_user_input(): return gr.update(value="") def reset_state(): return [], [], None # Delete last turn def delete_last_turn(chat, history): if chat and history: chat.pop(-1) history.pop(-1) return chat, history # Regenerate response def retry_last_answer( user_input, chatbot, max_length, top_p, temperature, history, past_key_values ): if chatbot and history: # Removing the previous conversation from chat chatbot.pop(-1) # Setting up a flag to capture a retry RETRY_FLAG = True # Getting last message from user user_input = history[-1][0] # Removing bot response from the history history.pop(-1) yield from predict( RETRY_FLAG, # type: ignore user_input, chatbot, max_length, top_p, temperature, history, past_key_values, ) with gr.Blocks(title="ChatGLM2-6B-int4", theme="TogtherAI/Alex2"(text_size="sm")) as demo: # gr.HTML("""

ChatGLM2-6B-int4

""") gr.HTML( """
Duplicate SpaceTo avoid the queue and for faster inference Duplicate this Space and upgrade to GPU
""" ) with gr.Accordion("🎈 Info", open=False): _ = """ ## ChatGLM2-6B-int4 Try to refresh the browser and try again when occasionally an error occurs. With a GPU, a query takes from a few seconds to a few tens of seconds, dependent on the number of words/characters the question and responses contain. The quality of the responses varies quite a bit it seems. Even the same question with the same parameters, asked at different times, can result in quite different responses. * Low temperature: responses will be more deterministic and focused; High temperature: responses more creative. * Suggested temperatures -- translation: up to 0.3; chatting: > 0.4 * Top P controls dynamic vocabulary selection based on context. For a table of example values for different scenarios, refer to [this](https://community.openai.com/t/cheat-sheet-mastering-temperature-and-top-p-in-chatgpt-api-a-few-tips-and-tricks-on-controlling-the-creativity-deterministic-output-of-prompt-responses/172683) If the instance is not on a GPU (T4), it will be very slow. You can try to run the colab notebook [chatglm2-6b-4bit colab notebook](https://colab.research.google.com/drive/1WkF7kOjVCcBBatDHjaGkuJHnPdMWNtbW?usp=sharing) for a spin. The T4 GPU is sponsored by a community GPU grant from Huggingface. Thanks a lot! """ gr.Markdown(dedent(_)) chatbot = gr.Chatbot() with gr.Row(): with gr.Column(scale=4): with gr.Column(scale=12): user_input = gr.Textbox( show_label=False, placeholder="Input...", ).style(container=False) RETRY_FLAG = gr.Checkbox(value=False, visible=False) with gr.Column(min_width=32, scale=1): with gr.Row(): submitBtn = gr.Button("Submit", variant="primary") deleteBtn = gr.Button("Delete last turn", variant="secondary") retryBtn = gr.Button("Regenerate", variant="secondary") with gr.Column(scale=1): emptyBtn = gr.Button("Clear History") max_length = gr.Slider( 0, 32768, value=8192, step=1.0, label="Maximum length", interactive=True, ) top_p = gr.Slider( 0, 1, value=0.85, step=0.01, label="Top P", interactive=True ) temperature = gr.Slider( 0.01, 1, value=0.95, step=0.01, label="Temperature", interactive=True ) history = gr.State([]) past_key_values = gr.State(None) user_input.submit( predict, [ RETRY_FLAG, user_input, chatbot, max_length, top_p, temperature, history, past_key_values, ], [chatbot, history, past_key_values], show_progress="full", ) submitBtn.click( predict, [ RETRY_FLAG, user_input, chatbot, max_length, top_p, temperature, history, past_key_values, ], [chatbot, history, past_key_values], show_progress="full", api_name="predict", ) submitBtn.click(reset_user_input, [], [user_input]) emptyBtn.click( reset_state, outputs=[chatbot, history, past_key_values], show_progress="full" ) retryBtn.click( retry_last_answer, inputs=[ user_input, chatbot, max_length, top_p, temperature, history, past_key_values, ], # outputs = [chatbot, history, last_user_message, user_message] outputs=[chatbot, history, past_key_values], ) deleteBtn.click(delete_last_turn, [chatbot, history], [chatbot, history]) with gr.Accordion("Example inputs", open=True): etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """ examples = gr.Examples( examples = [ ["Hallo! Wie geht es dir?"], ["Wie viele Stunden braucht ein Mensch, um einen Hubschrauber zu essen?"], ["Du bist ein hilfreicher und ehrlicher Assistent. Antworte immer so hilfreich wie möglich. Wenn eine Frage keinen Sinn ergibt oder faktisch nicht stimmig ist, erkläre warum, anstatt etwas Falsches zu antworten. Wenn du die Antwort auf eine Frage nicht kennst, teile bitte keine falschen Informationen mit."], ["Ich möchte, dass du als Lehrer für gesprochenes Englisch agierst und mein Englisch verbesserst. Ich werde mit dir auf Englisch sprechen und du antwortest mir auf Englisch, um mein gesprochenes Englisch zu üben. Bitte korrigiere streng meine Grammatikfehler, Tippfehler und faktischen Fehler. Stelle mir in deiner Antwort eine Frage. Jetzt lass uns üben, du könntest mir zuerst eine Frage stellen. Denke daran, meine Grammatikfehler, Tippfehler und faktischen Fehler streng zu korrigieren."], [f"Ich möchte, dass du dich wie {{Charakter}} aus {{Serie}} verhältst. Ich möchte, dass du antwortest und reagierst wie {{Charakter}}, unter Verwendung des Tons, der Manier und des Vokabulars, das {{Charakter}} verwenden würde. Schreibe keine Erklärungen. Antworte nur wie {{Charakter}}. Du musst das gesamte Wissen von {{Charakter}} kennen."] ] ], inputs=[user_input], examples_per_page=30, ) with gr.Accordion("For Chat/Translation API", open=False, visible=False): input_text = gr.Text() tr_btn = gr.Button("Go", variant="primary") out_text = gr.Text() tr_btn.click( trans_api, [input_text, max_length, top_p, temperature], out_text, # show_progress="full", api_name="tr", ) _ = """ input_text.submit( trans_api, [input_text, max_length, top_p, temperature], out_text, show_progress="full", api_name="tr1", ) # """ # demo.queue().launch(share=False, inbrowser=True) # demo.queue().launch(share=True, inbrowser=True, debug=True) demo.queue().launch(debug=True)