From 082888cbf5805c3fe0b210429129211d2d119397 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 19:22:38 -0400 Subject: [PATCH 01/13] from langchain docs --- run_localGPT.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index e0d97a5..8915f7e 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -7,6 +7,8 @@ from huggingface_hub import hf_hub_download from langchain.chains import RetrievalQA from langchain.embeddings import HuggingFaceInstructEmbeddings from langchain.llms import HuggingFacePipeline, LlamaCpp +from langchain.callbacks.manager import CallbackManager +from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.vectorstores import Chroma @@ -61,7 +63,17 @@ def load_model(device_type, model_id, model_basename=None): else: # The code supports all huggingface models that ends with GPTQ and have some variation # of .no-act.order or .safetensors in their HF repo. - logging.info("Using AutoGPTQForCausalLM for quantized models") + logging.info("Using GGML for quantized models") + n_gpu_layers = 40 # Change this value based on your model and your GPU VRAM pool. + n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. + callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) + return LlamaCpp( + model_path="./ggml-model-q4_0.bin", + n_gpu_layers=n_gpu_layers, + n_batch=n_batch, + verbose=True, + callback_manager=callback_manager + ) if ".safetensors" in model_basename: # Remove the ".safetensors" ending if present @@ -219,7 +231,7 @@ def main(device_type, show_sources): # model_id = "TheBloke/orca_mini_3B-GGML" # model_basename = "orca-mini-3b.ggmlv3.q4_0.bin" - model_id="TheBloke/Llama-2-7B-Chat-GGML" + model_id = "TheBloke/Llama-2-7B-Chat-GGML" model_basename = "llama-2-7b-chat.ggmlv3.q4_0.bin" llm = load_model(device_type, model_id=model_id, model_basename=model_basename) From a5822057f8cbb2cdd7d1ace994bf319dd59ea0a8 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 19:26:21 -0400 Subject: [PATCH 02/13] add hf_hub_download --- run_localGPT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/run_localGPT.py b/run_localGPT.py index 8915f7e..ef3481c 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -67,8 +67,9 @@ def load_model(device_type, model_id, model_basename=None): n_gpu_layers = 40 # Change this value based on your model and your GPU VRAM pool. n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. 
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) + model_path = hf_hub_download(repo_id=model_id, filename=model_basename) return LlamaCpp( - model_path="./ggml-model-q4_0.bin", + model_path=model_path, n_gpu_layers=n_gpu_layers, n_batch=n_batch, verbose=True, From ee17465b8c7375cc8e9411055e7e81c7633e8315 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 19:34:12 -0400 Subject: [PATCH 03/13] change context for llama 2 --- run_localGPT.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index ef3481c..295107b 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -64,11 +64,12 @@ def load_model(device_type, model_id, model_basename=None): # The code supports all huggingface models that ends with GPTQ and have some variation # of .no-act.order or .safetensors in their HF repo. logging.info("Using GGML for quantized models") - n_gpu_layers = 40 # Change this value based on your model and your GPU VRAM pool. - n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. + n_gpu_layers = 1000 # Change this value based on your model and your GPU VRAM pool. + n_batch = 4096 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) model_path = hf_hub_download(repo_id=model_id, filename=model_basename) return LlamaCpp( + n_ctx=4096, model_path=model_path, n_gpu_layers=n_gpu_layers, n_batch=n_batch, From a7959e90d03b5caba5b1b19e315c9637d1e683d7 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:11:03 -0400 Subject: [PATCH 04/13] try partial --- run_localGPT.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index 295107b..6b010f2 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -9,6 +9,7 @@ from langchain.embeddings import HuggingFaceInstructEmbeddings from langchain.llms import HuggingFacePipeline, LlamaCpp from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler +from functools import partial # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.vectorstores import Chroma @@ -46,19 +47,21 @@ def load_model(device_type, model_id, model_basename=None): logging.info("This action can take a few minutes!") if model_basename is not None: - if device_type.lower() in ["cpu", "mps"]: - logging.info("Using Llamacpp for quantized models") + if ".ggml" in model_basename: + logging.info("Using Llamacpp for GGML quantized models") model_path = hf_hub_download(repo_id=model_id, filename=model_basename) + kwargs = { + "model_path": model_path, + "n_ctx": 4096, + "max_tokens": 4096, + "temperature": 0, + "repeat_penalty": 1.15 + } if device_type.lower() == "mps": - return LlamaCpp( - model_path=model_path, - n_ctx=2048, - max_tokens=2048, - temperature=0, - repeat_penalty=1.15, - n_gpu_layers=1000, - ) - return LlamaCpp(model_path=model_path, n_ctx=2048, max_tokens=2048, temperature=0, repeat_penalty=1.15) + kwargs["n_gpu_layers"] = 1000 + if device_type.lower() == "cuda": + kwargs["n_batch"] = 4096 + return partial(LlamaCpp, **kwargs) else: # The code supports all huggingface models that ends with GPTQ and have some variation From faa4dc956a087bd9b2afbd4ad9326eb0efa454b3 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:16:05 -0400 Subject: [PATCH 05/13] remove 
unnecessary partial --- run_localGPT.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index 6b010f2..b7d34a8 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -9,7 +9,6 @@ from langchain.embeddings import HuggingFaceInstructEmbeddings from langchain.llms import HuggingFacePipeline, LlamaCpp from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler -from functools import partial # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.vectorstores import Chroma @@ -61,7 +60,7 @@ def load_model(device_type, model_id, model_basename=None): kwargs["n_gpu_layers"] = 1000 if device_type.lower() == "cuda": kwargs["n_batch"] = 4096 - return partial(LlamaCpp, **kwargs) + return LlamaCpp(**kwargs) else: # The code supports all huggingface models that ends with GPTQ and have some variation From 002099b7e7cfcb752cc3027fc14cf73120503598 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:20:44 -0400 Subject: [PATCH 06/13] stream] --- run_localGPT.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index b7d34a8..79d2aa0 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -49,12 +49,14 @@ def load_model(device_type, model_id, model_basename=None): if ".ggml" in model_basename: logging.info("Using Llamacpp for GGML quantized models") model_path = hf_hub_download(repo_id=model_id, filename=model_basename) + callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) kwargs = { "model_path": model_path, "n_ctx": 4096, "max_tokens": 4096, "temperature": 0, - "repeat_penalty": 1.15 + "repeat_penalty": 1.15, + callback_manager: callback_manager } if device_type.lower() == "mps": kwargs["n_gpu_layers"] = 1000 @@ -68,7 +70,6 @@ def load_model(device_type, model_id, model_basename=None): logging.info("Using GGML for quantized models") n_gpu_layers = 1000 # Change this value based on your model and your GPU VRAM pool. n_batch = 4096 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. 
- callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) model_path = hf_hub_download(repo_id=model_id, filename=model_basename) return LlamaCpp( n_ctx=4096, From 65515af4fcbcb26074d7b2c3097fb45fc91d7e1f Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:22:12 -0400 Subject: [PATCH 07/13] fix whoops --- run_localGPT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_localGPT.py b/run_localGPT.py index 79d2aa0..b68aab5 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -56,7 +56,7 @@ def load_model(device_type, model_id, model_basename=None): "max_tokens": 4096, "temperature": 0, "repeat_penalty": 1.15, - callback_manager: callback_manager + "callback_manager": callback_manager } if device_type.lower() == "mps": kwargs["n_gpu_layers"] = 1000 From 29048219ee4ecd1418aa304950aed57d5b4533b3 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:26:54 -0400 Subject: [PATCH 08/13] testing --- run_localGPT.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/run_localGPT.py b/run_localGPT.py index b68aab5..bbe48e0 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -62,7 +62,14 @@ def load_model(device_type, model_id, model_basename=None): kwargs["n_gpu_layers"] = 1000 if device_type.lower() == "cuda": kwargs["n_batch"] = 4096 - return LlamaCpp(**kwargs) + return LlamaCpp( + n_ctx=4096, + model_path=model_path, + n_gpu_layers=n_gpu_layers, + n_batch=n_batch, + verbose=True, + callback_manager=callback_manager + ) else: # The code supports all huggingface models that ends with GPTQ and have some variation From 5aa3d40c5ef27321515d75d088614fd5754422b2 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:27:45 -0400 Subject: [PATCH 09/13] test --- run_localGPT.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index bbe48e0..c6379e2 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -65,8 +65,8 @@ def load_model(device_type, model_id, model_basename=None): return LlamaCpp( n_ctx=4096, model_path=model_path, - n_gpu_layers=n_gpu_layers, - n_batch=n_batch, + n_gpu_layers=1000, + n_batch=4096, verbose=True, callback_manager=callback_manager ) From 926a8c98726d1e4f1ad6754fa87125d63e7f1d99 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:42:16 -0400 Subject: [PATCH 10/13] fix big whoops --- run_localGPT.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index c6379e2..697632c 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -61,15 +61,9 @@ def load_model(device_type, model_id, model_basename=None): if device_type.lower() == "mps": kwargs["n_gpu_layers"] = 1000 if device_type.lower() == "cuda": + kwargs["n_gpu_layers"] = 1000 kwargs["n_batch"] = 4096 - return LlamaCpp( - n_ctx=4096, - model_path=model_path, - n_gpu_layers=1000, - n_batch=4096, - verbose=True, - callback_manager=callback_manager - ) + return LlamaCpp(**kwargs) else: # The code supports all huggingface models that ends with GPTQ and have some variation From 808f689700399b626fd006910e6b854b944caed4 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:47:12 -0400 Subject: [PATCH 11/13] cleanup --- run_localGPT.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index 697632c..2ffb6ae 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -8,7 +8,6 @@ from langchain.chains import RetrievalQA from 
langchain.embeddings import HuggingFaceInstructEmbeddings from langchain.llms import HuggingFacePipeline, LlamaCpp from langchain.callbacks.manager import CallbackManager -from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.vectorstores import Chroma @@ -49,14 +48,12 @@ def load_model(device_type, model_id, model_basename=None): if ".ggml" in model_basename: logging.info("Using Llamacpp for GGML quantized models") model_path = hf_hub_download(repo_id=model_id, filename=model_basename) - callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) kwargs = { "model_path": model_path, "n_ctx": 4096, "max_tokens": 4096, "temperature": 0, "repeat_penalty": 1.15, - "callback_manager": callback_manager } if device_type.lower() == "mps": kwargs["n_gpu_layers"] = 1000 @@ -68,18 +65,7 @@ def load_model(device_type, model_id, model_basename=None): else: # The code supports all huggingface models that ends with GPTQ and have some variation # of .no-act.order or .safetensors in their HF repo. - logging.info("Using GGML for quantized models") - n_gpu_layers = 1000 # Change this value based on your model and your GPU VRAM pool. - n_batch = 4096 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. - model_path = hf_hub_download(repo_id=model_id, filename=model_basename) - return LlamaCpp( - n_ctx=4096, - model_path=model_path, - n_gpu_layers=n_gpu_layers, - n_batch=n_batch, - verbose=True, - callback_manager=callback_manager - ) + logging.info("Using AutoGPTQForCausalLM for quantized models") if ".safetensors" in model_basename: # Remove the ".safetensors" ending if present From b660fb05971fd186da6c4020b16073b9c17ad46d Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:50:37 -0400 Subject: [PATCH 12/13] reduce ctx for llama 1 models --- run_localGPT.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index 2ffb6ae..6d951eb 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -7,7 +7,6 @@ from huggingface_hub import hf_hub_download from langchain.chains import RetrievalQA from langchain.embeddings import HuggingFaceInstructEmbeddings from langchain.llms import HuggingFacePipeline, LlamaCpp -from langchain.callbacks.manager import CallbackManager # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.vectorstores import Chroma @@ -48,18 +47,17 @@ def load_model(device_type, model_id, model_basename=None): if ".ggml" in model_basename: logging.info("Using Llamacpp for GGML quantized models") model_path = hf_hub_download(repo_id=model_id, filename=model_basename) + max_ctx_size = 2048 kwargs = { "model_path": model_path, - "n_ctx": 4096, - "max_tokens": 4096, - "temperature": 0, - "repeat_penalty": 1.15, + "n_ctx": max_ctx_size, + "max_tokens": max_ctx_size, } if device_type.lower() == "mps": kwargs["n_gpu_layers"] = 1000 if device_type.lower() == "cuda": kwargs["n_gpu_layers"] = 1000 - kwargs["n_batch"] = 4096 + kwargs["n_batch"] = max_ctx_size return LlamaCpp(**kwargs) else: From d5c71528740a18dd3957479d8dde86728b893fdf Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 21:10:32 -0400 Subject: [PATCH 13/13] update README --- README.md | 73 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 0f39f4f..24ad90e 100644 --- 
a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Install conda conda create -n localGPT ``` -Activate +Activate ```shell conda activate localGPT @@ -30,6 +30,13 @@ In order to set your environment up to run the code here, first install all requ pip install -r requirements.txt ``` +If you want to use BLAS or Metal with [llama-cpp](<(https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal)>) you can set appropriate flags: + +```shell +# Example: cuBLAS +CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install -r requirements.txt +``` + Then install AutoGPTQ - if you want to run quantized models for GPU ```shell @@ -57,7 +64,7 @@ Run the following command to ingest all the data. `defaults to cuda` ```shell -python ingest.py +python ingest.py ``` Use the device type argument to specify a given device. @@ -260,40 +267,40 @@ Follow this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on- This is a test project to validate the feasibility of a fully local solution for question answering using LLMs and Vector embeddings. It is not production ready, and it is not meant to be used in production. Vicuna-7B is based on the Llama model so that has the original Llama license. - - # Common Errors - - [Torch not compatible with cuda enabled](https://github.com/pytorch/pytorch/issues/30664) +- [Torch not compatible with cuda enabled](https://github.com/pytorch/pytorch/issues/30664) - - Get cuda version + - Get cuda version + + ```shell + nvcc --version + ``` + + ```shell + nvidia-smi + ``` + + - Try Install pytorch fepending on your cuda version + ```shell + conda install -c pytorch torchvision cudatoolkit=10.1 pytorch + ``` + - If doesn't work try re installing + ```shell + pip uninstall torch + pip cache purge + pip install torch -f https://download.pytorch.org/whl/torch_stable.html + ``` - ```shell - nvcc --version - ``` - ```shell - nvidia-smi - ``` - - Try Install pytorch fepending on your cuda version - ```shell - conda install -c pytorch torchvision cudatoolkit=10.1 pytorch - ``` - - If doesn't work try re installing - ```shell - pip uninstall torch - pip cache purge - pip install torch -f https://download.pytorch.org/whl/torch_stable.html - ``` - [ERROR: pip's dependency resolver does not currently take into account all the packages that are installed](https://stackoverflow.com/questions/72672196/error-pips-dependency-resolver-does-not-currently-take-into-account-all-the-pa/76604141#76604141) - ```shell - pip install h5py - pip install typing-extensions - pip install wheel - ``` + ```shell + pip install h5py + pip install typing-extensions + pip install wheel + ``` - [Failed to import transformers](https://github.com/huggingface/transformers/issues/11262) - - Try re-install - ```shell - conda uninstall tokenizers, transformers - pip install transformers - ``` - + - Try re-install + ```shell + conda uninstall tokenizers, transformers + pip install transformers + ```
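
The hunks above revise the same GGML branch of `load_model()` several times. For reference, here is a minimal sketch of roughly where that branch lands after PATCH 12/13, reconstructed from the diffs. The `load_ggml_model()` wrapper is hypothetical and exists only to make the sketch self-contained — in the actual file this logic lives inline under `if ".ggml" in model_basename:` — but the parameter names and values (`n_ctx`/`max_tokens` of 2048, `n_gpu_layers=1000`, `n_batch` capped at the context size) come straight from the patches.

```python
# Sketch of the GGML path as of PATCH 12/13. The helper name is illustrative;
# in run_localGPT.py this logic sits inside load_model().
import logging

from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp


def load_ggml_model(device_type, model_id, model_basename):
    """Fetch a GGML checkpoint from the Hugging Face Hub and wrap it in LlamaCpp."""
    logging.info("Using Llamacpp for GGML quantized models")
    model_path = hf_hub_download(repo_id=model_id, filename=model_basename)

    # PATCH 12/13 pins the context back to 2048 so Llama 1 family models still fit.
    max_ctx_size = 2048
    kwargs = {
        "model_path": model_path,
        "n_ctx": max_ctx_size,
        "max_tokens": max_ctx_size,
    }
    # Offload layers to the GPU where one is available; 1000 is simply "more
    # layers than any supported model has", i.e. offload everything.
    if device_type.lower() == "mps":
        kwargs["n_gpu_layers"] = 1000
    if device_type.lower() == "cuda":
        kwargs["n_gpu_layers"] = 1000
        kwargs["n_batch"] = max_ctx_size
    return LlamaCpp(**kwargs)
```

Called with the defaults set in `main()` by the patches, this would be `load_ggml_model(device_type, "TheBloke/Llama-2-7B-Chat-GGML", "llama-2-7b-chat.ggmlv3.q4_0.bin")`.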
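
PATCH 11/13 also strips out the `StreamingStdOutCallbackHandler` wiring introduced earlier in the series. If token streaming is wanted on top of the final state, a minimal sketch using the same callback classes the earlier patches imported is shown below; the handler choice and `verbose=True` mirror PATCH 01/13, and nothing here is part of the final diff.

```python
from huggingface_hub import hf_hub_download
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp

# Model id and basename as set in main() in the patches above.
model_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGML",
    filename="llama-2-7b-chat.ggmlv3.q4_0.bin",
)

llm = LlamaCpp(
    model_path=model_path,
    n_ctx=2048,
    max_tokens=2048,
    # Stream generated tokens to stdout as they are produced.
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
)
```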