Merge branch 'main' into patch-1

PromptEngineer committed (via GitHub) on 2023-07-25 23:13:32 -07:00
2 changed files with 68 additions and 26 deletions


@@ -30,6 +30,25 @@ In order to set your environment up to run the code here, first install all requ
pip install -r requirements.txt
```
If you want to use BLAS or Metal with [llama-cpp](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal), you can set the appropriate flags:
```shell
# Example: cuBLAS
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install -r requirements.txt
```
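On Apple Silicon, the Metal backend can be enabled the same way. This is a sketch based on the llama-cpp-python installation docs; treat the exact `LLAMA_METAL` flag name as an assumption to verify against those docs:
```shell
# Example: Metal (Apple Silicon) -- flag name taken from the llama-cpp-python docs
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -r requirements.txt
```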
Then install AutoGPTQ if you want to run quantized models on GPU:
```shell
git clone https://github.com/PanQiWei/AutoGPTQ.git
cd AutoGPTQ
git checkout v0.2.2
pip install .
```
For more details, see the [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) repository.
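To confirm the build succeeded, a minimal import check can be run from the same environment; the only assumption here is that the package is importable as `auto_gptq`:
```shell
python -c "import auto_gptq; print('AutoGPTQ import OK')"
```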
## Test dataset
This repo uses the [Constitution of the USA](https://constitutioncenter.org/media/files/constitution.pdf) as an example.
@@ -249,10 +268,31 @@ Follow this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-
This is a test project to validate the feasibility of a fully local solution for question answering using LLMs and Vector embeddings. It is not production ready and is not meant to be used in production. Vicuna-7B is based on the Llama model, so it carries the original Llama license.
# Common Errors
- [Torch not compatible with cuda enabled](https://github.com/pytorch/pytorch/issues/30664)
- Get the CUDA version:
```shell
nvcc --version
```
```shell
nvidia-smi
```
- Try installing PyTorch matching your CUDA version:
```shell
conda install -c pytorch torchvision cudatoolkit=10.1 pytorch
```
- If that doesn't work, try reinstalling (a verification snippet follows this list):
```shell
pip uninstall torch
pip cache purge
pip install torch -f https://download.pytorch.org/whl/torch_stable.html
```
@@ -273,16 +313,16 @@ This is a test project to validate the feasibility of a fully local solution for
- [ERROR: pip's dependency resolver does not currently take into account all the packages that are installed](https://stackoverflow.com/questions/72672196/error-pips-dependency-resolver-does-not-currently-take-into-account-all-the-pa/76604141#76604141)
```shell
pip install h5py
pip install typing-extensions
pip install wheel
```
- [Failed to import transformers](https://github.com/huggingface/transformers/issues/11262)
- Try reinstalling:
```shell
conda uninstall tokenizers transformers
pip install transformers
```
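After applying the PyTorch or transformers fixes above, a quick sanity check is to import each package from the same environment; this sketch only relies on the standard `torch.cuda.is_available()` and `transformers.__version__` attributes:
```shell
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
python -c "import transformers; print(transformers.__version__)"
```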


@@ -44,19 +44,21 @@ def load_model(device_type, model_id, model_basename=None):
    logging.info("This action can take a few minutes!")
    if model_basename is not None:
        if ".ggml" in model_basename:
            logging.info("Using Llamacpp for GGML quantized models")
            model_path = hf_hub_download(repo_id=model_id, filename=model_basename)
            max_ctx_size = 2048
            kwargs = {
                "model_path": model_path,
                "n_ctx": max_ctx_size,
                "max_tokens": max_ctx_size,
            }
            # Offload all layers to the GPU on Metal (mps) or CUDA devices
            if device_type.lower() == "mps":
                kwargs["n_gpu_layers"] = 1000
            if device_type.lower() == "cuda":
                kwargs["n_gpu_layers"] = 1000
                kwargs["n_batch"] = max_ctx_size
            return LlamaCpp(**kwargs)
        else:
            # The code supports all huggingface models that end with GPTQ and have some variation
@@ -219,7 +221,7 @@ def main(device_type, show_sources):
# model_id = "TheBloke/orca_mini_3B-GGML"
# model_basename = "orca-mini-3b.ggmlv3.q4_0.bin"
model_id="TheBloke/Llama-2-7B-Chat-GGML"
model_id = "TheBloke/Llama-2-7B-Chat-GGML"
model_basename = "llama-2-7b-chat.ggmlv3.q4_0.bin"
llm = load_model(device_type, model_id=model_id, model_basename=model_basename)
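With the GGML defaults above, the script can then be launched from the shell. The script name and option below are assumptions taken from the localGPT repository, since the file name is not shown in this diff:
```shell
# Assumed entry point and option name from the localGPT repo; adjust if your file differs
python run_localGPT.py --device_type cuda
```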