Upload README.md (#2)

- Upload README.md (0cb5b6e777b0b8614bfd9a96598e1c6b2a5ce6f4)
- Upload README.md (c4086b6db5703a9e470b1169c6d50034a61b800b)
- Upload README.md (508c92b25eed2742376dfef741e7defd580d0af6)

README.md CHANGED
@@ -86,7 +86,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 model_dir = "internlm/internlm3-8b-instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
 # Set `torch_dtype=torch.float16` to load model in float16, otherwise it will be loaded as float32 and might cause OOM Error.
-
+model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True, torch_dtype=torch.float16)
 # (Optional) If on low resource devices, you can load model in 4-bit or 8-bit to further save GPU memory via bitsandbytes.
 # InternLM3 8B in 4bit will cost nearly 8GB GPU memory.
 # pip install -U bitsandbytes
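For reference, the bitsandbytes hint in the comments above corresponds roughly to the sketch below. It is not part of the diff; the `BitsAndBytesConfig` settings (compute dtype, `device_map`) are illustrative assumptions.

```python
# Sketch only: 4-bit loading of InternLM3-8B-Instruct via bitsandbytes.
# The README only hints at this in comments; the exact settings here are assumed.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_dir = "internlm/internlm3-8b-instruct"
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # ~8GB GPU memory for the 8B model, per the README comment
    bnb_4bit_compute_dtype=torch.float16,  # assumed compute dtype
)
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    trust_remote_code=True,
    quantization_config=quant_config,
    device_map="auto",
)
```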
@@ -108,6 +108,8 @@ generated_ids = model.generate(tokenized_chat, max_new_tokens=1024, temperature=
 generated_ids = [
     output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
 ]
+prompt = tokenizer.batch_decode(tokenized_chat)[0]
+print(prompt)
 response = tokenizer.batch_decode(generated_ids)[0]
 print(response)
 ```
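The changed lines sit inside the README's Transformers generation example. A hedged reconstruction of the surrounding cell is sketched below; the message content and sampling arguments are assumptions, and only the hunk lines above are verbatim.

```python
# Hedged reconstruction of the cell this hunk edits; messages and sampling
# settings are assumptions, not taken from the diff.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Please tell me five scenic spots in Shanghai"},
]
tokenized_chat = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

generated_ids = model.generate(tokenized_chat, max_new_tokens=1024, temperature=1.0, top_p=0.8, do_sample=True)

# Keep only the newly generated tokens.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
]
# Lines added by this commit: decode and print the rendered prompt for inspection.
prompt = tokenizer.batch_decode(tokenized_chat)[0]
print(prompt)
response = tokenizer.batch_decode(generated_ids)[0]
print(response)
```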
@@ -153,6 +155,10 @@ Find more details in the [LMDeploy documentation](https://lmdeploy.readthedocs.i
 
 
 
+#### Ollama inference
+
+TODO
+
 #### vLLM inference
 
 We are still working on merging the PR(https://github.com/vllm-project/vllm/pull/12037) into vLLM. In the meantime, please use the following PR link to install it manually.
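Once a build from that PR is installed, offline inference with vLLM would look roughly like the sketch below. This is an illustration under assumptions (prompt text and sampling values), not the README's own ```python block, whose contents are not shown in this hunk.

```python
# Hedged sketch of vLLM offline inference with InternLM3, assuming the PR above
# has been installed; prompt and sampling values are illustrative.
from vllm import LLM, SamplingParams

llm = LLM(model="internlm/internlm3-8b-instruct", trust_remote_code=True)
sampling_params = SamplingParams(temperature=0.8, top_p=0.8, max_tokens=1024)
outputs = llm.generate(["Please tell me five scenic spots in Shanghai"], sampling_params)
print(outputs)
```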
@@ -280,6 +286,8 @@ generated_ids = model.generate(tokenized_chat, max_new_tokens=8192)
 generated_ids = [
     output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
 ]
+prompt = tokenizer.batch_decode(tokenized_chat)[0]
+print(prompt)
 response = tokenizer.batch_decode(generated_ids)[0]
 print(response)
 ```
@@ -308,6 +316,10 @@ response = pipe(messages, gen_config=GenerationConfig(max_new_tokens=2048))
 print(response)
 ```
 
+#### Ollama inference
+
+TODO
+
 #### vLLM inference
 
 We are still working on merging the PR(https://github.com/vllm-project/vllm/pull/12037) into vLLM. In the meantime, please use the following PR link to install it manually.
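For context, the `pipe(...)` call in this hunk's header comes from the README's LMDeploy pipeline example. A minimal sketch of that setup is shown below; the message content is an assumption, not taken from the diff.

```python
# Minimal LMDeploy pipeline sketch around the call shown in the hunk header;
# the message content is assumed for illustration.
from lmdeploy import pipeline, GenerationConfig

pipe = pipeline("internlm/internlm3-8b-instruct")
messages = [{"role": "user", "content": "Please tell me five scenic spots in Shanghai"}]
response = pipe(messages, gen_config=GenerationConfig(max_new_tokens=2048))
print(response)
```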
@@ -345,7 +357,7 @@ print(outputs)
 
 ## Open Source License
 
-
+Code and model weights are licensed under Apache-2.0.
 
 ## Citation
 
@@ -369,7 +381,7 @@ The code is licensed under Apache-2.0, while model weights are fully open for ac
 InternLM3, the third generation of the 书生·浦语 (InternLM) series, open-sources InternLM3-8B-Instruct, an 8-billion-parameter instruction model for general-purpose use and advanced reasoning. The model has the following features:
 
 - **Higher performance at lower cost**:
-  State-of-the-art performance among models of the same size on reasoning and knowledge-intensive tasks, surpassing Llama3.1-8B and Qwen2.5-7B
+  State-of-the-art performance among models of the same size on reasoning and knowledge-intensive tasks, surpassing Llama3.1-8B and Qwen2.5-7B. Notably, InternLM3 is trained on only 4 trillion tokens, saving more than 75% of the training cost compared with models of the same scale.
 - **Deep thinking capability**:
   InternLM3 supports a deep-thinking mode that solves complex reasoning tasks via long chains of thought, while also offering a smoother general-response mode for everyday use.
 
@@ -423,7 +435,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 model_dir = "internlm/internlm3-8b-instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
 # Set `torch_dtype=torch.float16` to load model in float16, otherwise it will be loaded as float32 and might cause OOM Error.
-
+model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True, torch_dtype=torch.float16)
 # (Optional) If on low resource devices, you can load model in 4-bit or 8-bit to further save GPU memory via bitsandbytes.
 # InternLM3 8B in 4bit will cost nearly 8GB GPU memory.
 # pip install -U bitsandbytes
@@ -445,6 +457,8 @@ generated_ids = model.generate(tokenized_chat, max_new_tokens=1024, temperature=
 generated_ids = [
     output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
 ]
+prompt = tokenizer.batch_decode(tokenized_chat)[0]
+print(prompt)
 response = tokenizer.batch_decode(generated_ids)[0]
 print(response)
 ```
@@ -491,7 +505,12 @@ curl http://localhost:23333/v1/chat/completions \
 
 
 
+##### Ollama inference
+
+TODO
+
 ##### vLLM inference
+
 We are still working on merging the PR (https://github.com/vllm-project/vllm/pull/12037) into vLLM. For now, please install it manually from the PR link below.
 
 ```python
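The curl command in this hunk's header targets the OpenAI-compatible server started in the README's LMDeploy serving section. The same request from Python would look roughly like this sketch; the model name and prompt are assumptions.

```python
# Hedged sketch: querying the OpenAI-compatible endpoint shown in the hunk header
# (curl http://localhost:23333/v1/chat/completions); model name and prompt are assumed.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:23333/v1", api_key="none")
completion = client.chat.completions.create(
    model="internlm/internlm3-8b-instruct",
    messages=[{"role": "user", "content": "Please tell me five scenic spots in Shanghai"}],
    max_tokens=1024,
)
print(completion.choices[0].message.content)
```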
@@ -616,6 +635,8 @@ generated_ids = model.generate(tokenized_chat, max_new_tokens=8192)
 generated_ids = [
     output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
 ]
+prompt = tokenizer.batch_decode(tokenized_chat)[0]
+print(prompt)
 response = tokenizer.batch_decode(generated_ids)[0]
 print(response)
 ```
@@ -644,6 +665,10 @@ response = pipe(messages, gen_config=GenerationConfig(max_new_tokens=2048))
 print(response)
 ```
 
+##### Ollama inference
+
+TODO
+
 ##### vLLM inference
 
 We are still working on merging the PR (https://github.com/vllm-project/vllm/pull/12037) into vLLM. For now, please install it manually from the PR link below.
@@ -687,7 +712,7 @@ print(outputs)
 
 ## Open Source License
 
-
+The code and model weights in this repository are released under the Apache-2.0 license.
 
 ## Citation
 