From 082888cbf5805c3fe0b210429129211d2d119397 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 19:22:38 -0400 Subject: [PATCH 01/13] from langchain docs --- run_localGPT.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index e0d97a5..8915f7e 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -7,6 +7,8 @@ from huggingface_hub import hf_hub_download from langchain.chains import RetrievalQA from langchain.embeddings import HuggingFaceInstructEmbeddings from langchain.llms import HuggingFacePipeline, LlamaCpp +from langchain.callbacks.manager import CallbackManager +from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.vectorstores import Chroma @@ -61,7 +63,17 @@ def load_model(device_type, model_id, model_basename=None): else: # The code supports all huggingface models that ends with GPTQ and have some variation # of .no-act.order or .safetensors in their HF repo. - logging.info("Using AutoGPTQForCausalLM for quantized models") + logging.info("Using GGML for quantized models") + n_gpu_layers = 40 # Change this value based on your model and your GPU VRAM pool. + n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. + callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) + return LlamaCpp( + model_path="./ggml-model-q4_0.bin", + n_gpu_layers=n_gpu_layers, + n_batch=n_batch, + verbose=True, + callback_manager=callback_manager + ) if ".safetensors" in model_basename: # Remove the ".safetensors" ending if present @@ -219,7 +231,7 @@ def main(device_type, show_sources): # model_id = "TheBloke/orca_mini_3B-GGML" # model_basename = "orca-mini-3b.ggmlv3.q4_0.bin" - model_id="TheBloke/Llama-2-7B-Chat-GGML" + model_id = "TheBloke/Llama-2-7B-Chat-GGML" model_basename = "llama-2-7b-chat.ggmlv3.q4_0.bin" llm = load_model(device_type, model_id=model_id, model_basename=model_basename) From a5822057f8cbb2cdd7d1ace994bf319dd59ea0a8 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 19:26:21 -0400 Subject: [PATCH 02/13] add hf_hub_download --- run_localGPT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/run_localGPT.py b/run_localGPT.py index 8915f7e..ef3481c 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -67,8 +67,9 @@ def load_model(device_type, model_id, model_basename=None): n_gpu_layers = 40 # Change this value based on your model and your GPU VRAM pool. n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. 
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) + model_path = hf_hub_download(repo_id=model_id, filename=model_basename) return LlamaCpp( - model_path="./ggml-model-q4_0.bin", + model_path=model_path, n_gpu_layers=n_gpu_layers, n_batch=n_batch, verbose=True, From ee17465b8c7375cc8e9411055e7e81c7633e8315 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 19:34:12 -0400 Subject: [PATCH 03/13] change context for llama 2 --- run_localGPT.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index ef3481c..295107b 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -64,11 +64,12 @@ def load_model(device_type, model_id, model_basename=None): # The code supports all huggingface models that ends with GPTQ and have some variation # of .no-act.order or .safetensors in their HF repo. logging.info("Using GGML for quantized models") - n_gpu_layers = 40 # Change this value based on your model and your GPU VRAM pool. - n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. + n_gpu_layers = 1000 # Change this value based on your model and your GPU VRAM pool. + n_batch = 4096 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) model_path = hf_hub_download(repo_id=model_id, filename=model_basename) return LlamaCpp( + n_ctx=4096, model_path=model_path, n_gpu_layers=n_gpu_layers, n_batch=n_batch, From a7959e90d03b5caba5b1b19e315c9637d1e683d7 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:11:03 -0400 Subject: [PATCH 04/13] try partial --- run_localGPT.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index 295107b..6b010f2 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -9,6 +9,7 @@ from langchain.embeddings import HuggingFaceInstructEmbeddings from langchain.llms import HuggingFacePipeline, LlamaCpp from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler +from functools import partial # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.vectorstores import Chroma @@ -46,19 +47,21 @@ def load_model(device_type, model_id, model_basename=None): logging.info("This action can take a few minutes!") if model_basename is not None: - if device_type.lower() in ["cpu", "mps"]: - logging.info("Using Llamacpp for quantized models") + if ".ggml" in model_basename: + logging.info("Using Llamacpp for GGML quantized models") model_path = hf_hub_download(repo_id=model_id, filename=model_basename) + kwargs = { + "model_path": model_path, + "n_ctx": 4096, + "max_tokens": 4096, + "temperature": 0, + "repeat_penalty": 1.15 + } if device_type.lower() == "mps": - return LlamaCpp( - model_path=model_path, - n_ctx=2048, - max_tokens=2048, - temperature=0, - repeat_penalty=1.15, - n_gpu_layers=1000, - ) - return LlamaCpp(model_path=model_path, n_ctx=2048, max_tokens=2048, temperature=0, repeat_penalty=1.15) + kwargs["n_gpu_layers"] = 1000 + if device_type.lower() == "cuda": + kwargs["n_batch"] = 4096 + return partial(LlamaCpp, **kwargs) else: # The code supports all huggingface models that ends with GPTQ and have some variation From faa4dc956a087bd9b2afbd4ad9326eb0efa454b3 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:16:05 -0400 Subject: [PATCH 05/13] remove 
unnecessary partial --- run_localGPT.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index 6b010f2..b7d34a8 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -9,7 +9,6 @@ from langchain.embeddings import HuggingFaceInstructEmbeddings from langchain.llms import HuggingFacePipeline, LlamaCpp from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler -from functools import partial # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.vectorstores import Chroma @@ -61,7 +60,7 @@ def load_model(device_type, model_id, model_basename=None): kwargs["n_gpu_layers"] = 1000 if device_type.lower() == "cuda": kwargs["n_batch"] = 4096 - return partial(LlamaCpp, **kwargs) + return LlamaCpp(**kwargs) else: # The code supports all huggingface models that ends with GPTQ and have some variation From 002099b7e7cfcb752cc3027fc14cf73120503598 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:20:44 -0400 Subject: [PATCH 06/13] stream] --- run_localGPT.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index b7d34a8..79d2aa0 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -49,12 +49,14 @@ def load_model(device_type, model_id, model_basename=None): if ".ggml" in model_basename: logging.info("Using Llamacpp for GGML quantized models") model_path = hf_hub_download(repo_id=model_id, filename=model_basename) + callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) kwargs = { "model_path": model_path, "n_ctx": 4096, "max_tokens": 4096, "temperature": 0, - "repeat_penalty": 1.15 + "repeat_penalty": 1.15, + callback_manager: callback_manager } if device_type.lower() == "mps": kwargs["n_gpu_layers"] = 1000 @@ -68,7 +70,6 @@ def load_model(device_type, model_id, model_basename=None): logging.info("Using GGML for quantized models") n_gpu_layers = 1000 # Change this value based on your model and your GPU VRAM pool. n_batch = 4096 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. 
- callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) model_path = hf_hub_download(repo_id=model_id, filename=model_basename) return LlamaCpp( n_ctx=4096, From 65515af4fcbcb26074d7b2c3097fb45fc91d7e1f Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:22:12 -0400 Subject: [PATCH 07/13] fix whoops --- run_localGPT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_localGPT.py b/run_localGPT.py index 79d2aa0..b68aab5 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -56,7 +56,7 @@ def load_model(device_type, model_id, model_basename=None): "max_tokens": 4096, "temperature": 0, "repeat_penalty": 1.15, - callback_manager: callback_manager + "callback_manager": callback_manager } if device_type.lower() == "mps": kwargs["n_gpu_layers"] = 1000 From 29048219ee4ecd1418aa304950aed57d5b4533b3 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:26:54 -0400 Subject: [PATCH 08/13] testing --- run_localGPT.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/run_localGPT.py b/run_localGPT.py index b68aab5..bbe48e0 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -62,7 +62,14 @@ def load_model(device_type, model_id, model_basename=None): kwargs["n_gpu_layers"] = 1000 if device_type.lower() == "cuda": kwargs["n_batch"] = 4096 - return LlamaCpp(**kwargs) + return LlamaCpp( + n_ctx=4096, + model_path=model_path, + n_gpu_layers=n_gpu_layers, + n_batch=n_batch, + verbose=True, + callback_manager=callback_manager + ) else: # The code supports all huggingface models that ends with GPTQ and have some variation From 5aa3d40c5ef27321515d75d088614fd5754422b2 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:27:45 -0400 Subject: [PATCH 09/13] test --- run_localGPT.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index bbe48e0..c6379e2 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -65,8 +65,8 @@ def load_model(device_type, model_id, model_basename=None): return LlamaCpp( n_ctx=4096, model_path=model_path, - n_gpu_layers=n_gpu_layers, - n_batch=n_batch, + n_gpu_layers=1000, + n_batch=4096, verbose=True, callback_manager=callback_manager ) From 926a8c98726d1e4f1ad6754fa87125d63e7f1d99 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:42:16 -0400 Subject: [PATCH 10/13] fix big whoops --- run_localGPT.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index c6379e2..697632c 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -61,15 +61,9 @@ def load_model(device_type, model_id, model_basename=None): if device_type.lower() == "mps": kwargs["n_gpu_layers"] = 1000 if device_type.lower() == "cuda": + kwargs["n_gpu_layers"] = 1000 kwargs["n_batch"] = 4096 - return LlamaCpp( - n_ctx=4096, - model_path=model_path, - n_gpu_layers=1000, - n_batch=4096, - verbose=True, - callback_manager=callback_manager - ) + return LlamaCpp(**kwargs) else: # The code supports all huggingface models that ends with GPTQ and have some variation From 808f689700399b626fd006910e6b854b944caed4 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:47:12 -0400 Subject: [PATCH 11/13] cleanup --- run_localGPT.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index 697632c..2ffb6ae 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -8,7 +8,6 @@ from langchain.chains import RetrievalQA from 
langchain.embeddings import HuggingFaceInstructEmbeddings from langchain.llms import HuggingFacePipeline, LlamaCpp from langchain.callbacks.manager import CallbackManager -from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.vectorstores import Chroma @@ -49,14 +48,12 @@ def load_model(device_type, model_id, model_basename=None): if ".ggml" in model_basename: logging.info("Using Llamacpp for GGML quantized models") model_path = hf_hub_download(repo_id=model_id, filename=model_basename) - callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) kwargs = { "model_path": model_path, "n_ctx": 4096, "max_tokens": 4096, "temperature": 0, "repeat_penalty": 1.15, - "callback_manager": callback_manager } if device_type.lower() == "mps": kwargs["n_gpu_layers"] = 1000 @@ -68,18 +65,7 @@ def load_model(device_type, model_id, model_basename=None): else: # The code supports all huggingface models that ends with GPTQ and have some variation # of .no-act.order or .safetensors in their HF repo. - logging.info("Using GGML for quantized models") - n_gpu_layers = 1000 # Change this value based on your model and your GPU VRAM pool. - n_batch = 4096 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. - model_path = hf_hub_download(repo_id=model_id, filename=model_basename) - return LlamaCpp( - n_ctx=4096, - model_path=model_path, - n_gpu_layers=n_gpu_layers, - n_batch=n_batch, - verbose=True, - callback_manager=callback_manager - ) + logging.info("Using AutoGPTQForCausalLM for quantized models") if ".safetensors" in model_basename: # Remove the ".safetensors" ending if present From b660fb05971fd186da6c4020b16073b9c17ad46d Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 20:50:37 -0400 Subject: [PATCH 12/13] reduce ctx for llama 1 models --- run_localGPT.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index 2ffb6ae..6d951eb 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -7,7 +7,6 @@ from huggingface_hub import hf_hub_download from langchain.chains import RetrievalQA from langchain.embeddings import HuggingFaceInstructEmbeddings from langchain.llms import HuggingFacePipeline, LlamaCpp -from langchain.callbacks.manager import CallbackManager # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.vectorstores import Chroma @@ -48,18 +47,17 @@ def load_model(device_type, model_id, model_basename=None): if ".ggml" in model_basename: logging.info("Using Llamacpp for GGML quantized models") model_path = hf_hub_download(repo_id=model_id, filename=model_basename) + max_ctx_size = 2048 kwargs = { "model_path": model_path, - "n_ctx": 4096, - "max_tokens": 4096, - "temperature": 0, - "repeat_penalty": 1.15, + "n_ctx": max_ctx_size, + "max_tokens": max_ctx_size, } if device_type.lower() == "mps": kwargs["n_gpu_layers"] = 1000 if device_type.lower() == "cuda": kwargs["n_gpu_layers"] = 1000 - kwargs["n_batch"] = 4096 + kwargs["n_batch"] = max_ctx_size return LlamaCpp(**kwargs) else: From d5c71528740a18dd3957479d8dde86728b893fdf Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sun, 23 Jul 2023 21:10:32 -0400 Subject: [PATCH 13/13] update README --- README.md | 73 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 0f39f4f..24ad90e 100644 --- 
a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Install conda conda create -n localGPT ``` -Activate +Activate ```shell conda activate localGPT @@ -30,6 +30,13 @@ In order to set your environment up to run the code here, first install all requ pip install -r requirements.txt ``` +If you want to use BLAS or Metal with [llama-cpp](<(https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal)>) you can set appropriate flags: + +```shell +# Example: cuBLAS +CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install -r requirements.txt +``` + Then install AutoGPTQ - if you want to run quantized models for GPU ```shell @@ -57,7 +64,7 @@ Run the following command to ingest all the data. `defaults to cuda` ```shell -python ingest.py +python ingest.py ``` Use the device type argument to specify a given device. @@ -260,40 +267,40 @@ Follow this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on- This is a test project to validate the feasibility of a fully local solution for question answering using LLMs and Vector embeddings. It is not production ready, and it is not meant to be used in production. Vicuna-7B is based on the Llama model so that has the original Llama license. - - # Common Errors - - [Torch not compatible with cuda enabled](https://github.com/pytorch/pytorch/issues/30664) +- [Torch not compatible with cuda enabled](https://github.com/pytorch/pytorch/issues/30664) - - Get cuda version + - Get cuda version + + ```shell + nvcc --version + ``` + + ```shell + nvidia-smi + ``` + + - Try Install pytorch fepending on your cuda version + ```shell + conda install -c pytorch torchvision cudatoolkit=10.1 pytorch + ``` + - If doesn't work try re installing + ```shell + pip uninstall torch + pip cache purge + pip install torch -f https://download.pytorch.org/whl/torch_stable.html + ``` - ```shell - nvcc --version - ``` - ```shell - nvidia-smi - ``` - - Try Install pytorch fepending on your cuda version - ```shell - conda install -c pytorch torchvision cudatoolkit=10.1 pytorch - ``` - - If doesn't work try re installing - ```shell - pip uninstall torch - pip cache purge - pip install torch -f https://download.pytorch.org/whl/torch_stable.html - ``` - [ERROR: pip's dependency resolver does not currently take into account all the packages that are installed](https://stackoverflow.com/questions/72672196/error-pips-dependency-resolver-does-not-currently-take-into-account-all-the-pa/76604141#76604141) - ```shell - pip install h5py - pip install typing-extensions - pip install wheel - ``` + ```shell + pip install h5py + pip install typing-extensions + pip install wheel + ``` - [Failed to import transformers](https://github.com/huggingface/transformers/issues/11262) - - Try re-install - ```shell - conda uninstall tokenizers, transformers - pip install transformers - ``` - + - Try re-install + ```shell + conda uninstall tokenizers, transformers + pip install transformers + ```
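
The hunks above revise the same GGML branch of `load_model()` several times. For reference, here is a minimal sketch of roughly where that branch lands after PATCH 12/13, reconstructed from the diffs. The `load_ggml_model()` wrapper is hypothetical and exists only to make the sketch self-contained — in the actual file this logic lives inline under `if ".ggml" in model_basename:` — but the parameter names and values (`n_ctx`/`max_tokens` of 2048, `n_gpu_layers=1000`, `n_batch` capped at the context size) come straight from the patches.

```python
# Sketch of the GGML path as of PATCH 12/13. The helper name is illustrative;
# in run_localGPT.py this logic sits inside load_model().
import logging

from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp


def load_ggml_model(device_type, model_id, model_basename):
    """Fetch a GGML checkpoint from the Hugging Face Hub and wrap it in LlamaCpp."""
    logging.info("Using Llamacpp for GGML quantized models")
    model_path = hf_hub_download(repo_id=model_id, filename=model_basename)

    # PATCH 12/13 pins the context back to 2048 so Llama 1 family models still fit.
    max_ctx_size = 2048
    kwargs = {
        "model_path": model_path,
        "n_ctx": max_ctx_size,
        "max_tokens": max_ctx_size,
    }
    # Offload layers to the GPU where one is available; 1000 is simply "more
    # layers than any supported model has", i.e. offload everything.
    if device_type.lower() == "mps":
        kwargs["n_gpu_layers"] = 1000
    if device_type.lower() == "cuda":
        kwargs["n_gpu_layers"] = 1000
        kwargs["n_batch"] = max_ctx_size
    return LlamaCpp(**kwargs)
```

Called with the defaults set in `main()` by the patches, this would be `load_ggml_model(device_type, "TheBloke/Llama-2-7B-Chat-GGML", "llama-2-7b-chat.ggmlv3.q4_0.bin")`.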
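
PATCH 11/13 also strips out the `StreamingStdOutCallbackHandler` wiring introduced earlier in the series. If token streaming is wanted on top of the final state, a minimal sketch using the same callback classes the earlier patches imported is shown below; the handler choice and `verbose=True` mirror PATCH 01/13, and nothing here is part of the final diff.

```python
from huggingface_hub import hf_hub_download
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp

# Model id and basename as set in main() in the patches above.
model_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGML",
    filename="llama-2-7b-chat.ggmlv3.q4_0.bin",
)

llm = LlamaCpp(
    model_path=model_path,
    n_ctx=2048,
    max_tokens=2048,
    # Stream generated tokens to stdout as they are produced.
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
)
```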