[Bug] InternLM2 int4 出现重复说话、重复前置内容(system prompt)现象

sanbuphy opened this issue · comments

Describe the bug

我使用 lmdeploy 部署了 InternLM2 int4 的 pipeline,用于英文翻译推理。pipe 只初始化一次,之后多次调用(每一次都是单独的 pipe(输入)),多次调用后会出现重复说话、重复前置内容(system prompt)的情况:




from lmdeploy import pipeline, TurbomindEngineConfig,GenerationConfig
class InternLM2():
    """Wrapper around an lmdeploy ``pipeline`` for an AWQ (int4) InternLM2 model.

    The pipeline is built once in the constructor and reused by every
    ``infer`` call.
    """

    # Model used when the caller does not supply ``model_path``.
    DEFAULT_MODEL = "internlm/internlm2-chat-7b-4bits"

    def __init__(self, model_path="", max_batch_size=1, session_len=4096):
        """Create the inference pipeline.

        Args:
            model_path: Model path or id handed to ``pipeline``. When empty
                (or ``None``) ``DEFAULT_MODEL`` is used instead.
            max_batch_size: Forwarded to ``TurbomindEngineConfig``.
            session_len: Forwarded to ``TurbomindEngineConfig``.
        """
        self.model_path = model_path
        self.model = self._load_model(max_batch_size, session_len)

    def _load_model(self, max_batch_size, session_len):
        """Build and return the pipeline for ``self.model_path`` (or the default model)."""
        engine_config = TurbomindEngineConfig(
            model_format='awq',
            max_batch_size=max_batch_size,
            session_len=session_len,
        )
        # Truthiness check instead of `is not ""`: identity comparison with a
        # string literal is unreliable. Returning from each branch ensures a
        # user-supplied model is not replaced by the default one, and that the
        # empty-path case no longer hits an unbound local.
        if self.model_path:
            return pipeline(self.model_path, backend_config=engine_config)
        return pipeline(self.DEFAULT_MODEL, backend_config=engine_config)

    def infer(self, system_prompt, src_text: str, gen_config: GenerationConfig) -> list:
        """Run one request and return the pipeline's response list.

        ``system_prompt`` and ``src_text`` are concatenated into a single
        prompt string and sent as a one-element batch, so the result is
        indexed by the caller (e.g. ``result[0].text``).
        """
        response = self.model([system_prompt + src_text], gen_config)
        return response

# Build the model wrapper once and reuse it for every chunk.
internLM2 = InternLM2(session_len=2048)
# Sampling settings passed along with every request.
gen_config = GenerationConfig(top_k=20, top_p=0.3, temperature=0.1)
# NOTE(review): the prompt text is missing from the report; put the real
# translation instruction between the quotes.
translator_system_prompt = """
"""

# Translate chunk by chunk, echoing each source/translation pair to stdout and
# writing the translation to the output file.
with open(translate_filename, 'w', encoding='utf-8') as file:
    for chunk in new_paragraphs:
        chunk_translate = internLM2.infer(translator_system_prompt, f"{chunk}", gen_config)
        print(chunk, '\n', chunk_translate[0].text)
        file.write(
            chunk_translate[0].text + '\n')



sys.platform: linux
Python: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0]
CUDA available: True
MUSA available: False
numpy_random_seed: 2147483648
GPU 0: NVIDIA GeForce RTX 3060
CUDA_HOME: //usr/local/cuda-12
NVCC: Cuda compilation tools, release 12.3, V12.3.52
GCC: gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
PyTorch: 2.1.0+cu121
PyTorch compiling details: PyTorch built with:

  • GCC 9.3
  • C++ Version: 201703
  • Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications
  • Intel(R) MKL-DNN v3.1.1 (Git Hash 64f6bcbcbab628e96f33a62c3e975f8535a7bde4)
  • OpenMP 201511 (a.k.a. OpenMP 4.5)
  • LAPACK is enabled (usually provided by MKL)
  • NNPACK is enabled
  • CPU capability usage: AVX2
  • CUDA Runtime 12.1
  • NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90
  • CuDNN 8.9.2
  • Magma 2.6.1
  • Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=12.1, CUDNN_VERSION=8.9.2, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=old-style-cast -Wno-invalid-partial-specialization -Wno-unused-private-field -Wno-aligned-allocation-unavailable -Wno-missing-braces -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.1.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF,

TorchVision: 0.16.0+cu121
LMDeploy: 0.2.4+f5bc455
transformers: 4.36.2
gradio: 3.36.1
fastapi: 0.109.2
pydantic: 2.6.1

Other information

# 使用 internLM2 总结文件,格式化输出到目标位置
import re
import os
from pathlib import Path
from whispertranslator.llm import InternLM2
from lmdeploy import GenerationConfig
# Input text file; it is read below and its base name is reused for the output file name.
src_path = "原文.txt"
# Directory that the output file path is built from.
export_dir = './'

def split_text(text, max_word_count):
    """Split *text* into chunks holding at most *max_word_count* words each.

    The text is cut after every comma or period that is followed by
    whitespace, and the resulting sentences are greedily packed into chunks.
    A single sentence longer than *max_word_count* is kept whole as its own
    chunk rather than being cut mid-sentence.

    Args:
        text: The text to split.
        max_word_count: Upper bound on the number of words per chunk.

    Returns:
        A list of chunk strings without leading or trailing whitespace.
        Empty or whitespace-only input yields an empty list.
    """

    def count_words(text):
        # A "word" is a maximal run of word characters.
        return len(re.findall(r'\b\w+\b', text))

    sentences = re.split(r'(?<=[,.])\s', text)  # split after commas and periods
    new_paragraphs = []
    pending = []          # sentences collected for the chunk being built
    pending_words = 0     # word count of `pending`

    def flush():
        # Emit the chunk being built, skipping chunks that are only whitespace.
        chunk = ' '.join(pending).strip()
        if chunk:
            new_paragraphs.append(chunk)

    for sentence in sentences:
        sentence_word_count = count_words(sentence)
        # Close the current chunk when adding this sentence would exceed the limit.
        if pending and pending_words + sentence_word_count > max_word_count:
            flush()
            pending = []
            pending_words = 0
        pending.append(sentence)
        pending_words += sentence_word_count

    # Emit whatever is left after the last sentence.
    flush()

    return new_paragraphs

# Read the whole source text; the encoding is explicit so the result does not
# depend on the platform default.
with open(src_path, 'r', encoding='utf-8') as file:
    full_text = file.read()

new_paragraphs = split_text(full_text, max_word_count=150)

# The output file is named after the source file and placed under export_dir.
translate_filename = os.path.basename(src_path) + '_translate_new' + '.txt'
translate_filename = Path(export_dir) / translate_filename

internLM2 = InternLM2(session_len=2048)
gen_config = GenerationConfig(top_k=20, top_p=0.3, temperature=0.1)
# NOTE(review): both prompt texts are missing from the report; put the real
# instructions between the quotes.
translator_system_prompt = """
"""
summary_system_prompt = f"""
"""
# Translate chunk by chunk, echoing each source/translation pair to stdout and
# writing the translation to the output file.
with open(translate_filename, 'w', encoding='utf-8') as file:
    for chunk in new_paragraphs:
        chunk_translate = internLM2.infer(translator_system_prompt, f"{chunk}", gen_config)
        print(chunk, '\n', chunk_translate[0].text)
        file.write(
            chunk_translate[0].text + '\n')
# Disabled summary step, kept for reference (chunk_filename is not defined in this script):
# with open(chunk_filename, 'r', encoding='utf-8') as file:
#     content = file.read()
#     content = internLM2.infer(summary_system_prompt,
#                                 content)[0].text + '\n' + content
# with open(chunk_filename, "w") as file:
#     file.write(content)


针对重复输出 translator_system_prompt 的问题,可以改用下面这种方式试试:把 system prompt 放到 system 角色(role)里面,另外再强化一下指令的要求:

# Chat-style input: a batch containing one conversation, with the instruction
# in the system message and the text to translate in the user message.
prompts = [[
    {
        'role': 'system',
        'content': '把下列文字翻译成中文,只返回给我翻译结果,不要输出任何额外内容'
    },
    {
        'role': 'user',
        'content': '待翻译的文本'
    }
]]
response = self.model(prompts, gen_config)

针对重复输出 translator_system_prompt 的问题,可以改用下面这种方式试试:把 system prompt 放到 system 角色(role)里面,另外再强化一下指令的要求:

# Chat-style input: a batch containing one conversation, with the instruction
# in the system message and the text to translate in the user message.
prompts = [[
    {
        'role': 'system',
        'content': '把下列文字翻译成中文,只返回给我翻译结果,不要输出任何额外内容'
    },
    {
        'role': 'user',
        'content': '待翻译的文本'
    }
]]
response = self.model(prompts, gen_config)

仍然没有改善(哭泣),还是有类似现象。




我感觉可能是RLHF的时候有些过拟合了,导致模型变得过于helpful,一般表现为在回复的答案前后加过多额外的内容,没法严格遵循指令。 以及翻译名字变成书生浦语应该也是过拟合导致的,训练时候身份认知数据加太多导致“我的名字是”这几个token后面出现“书生浦语”的概率变得太高了。 chat模型实在纠正不过来的话,要不考虑换成没有rl过的chat-sft模型试试。不过我也不确定会不会变好。

感觉得等下一版本?

