Haystack も使ってみた
By takagiwa on Thursday, February 29 2024, 22:04 - LLM - Permalink
日本語を使うには一手間要りそう
(base) E:\Projects>mkdir haystack (base) E:\Projects>cd haystack (base) E:\Projects\haystack>pip install farm-haystack -f https://download.pytorch.org/whl/torch_stable.html (base) E:\Projects\haystack>python test.py File "E:\Projects\haystack\test.py", line 14 add_example_data(document_store, "D:\Nextcloud\Books") ^ SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: malformed \N character escape (base) E:\Projects\haystack>python test.py None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used. Traceback (most recent call last): File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\document_stores\memory.py", line 29, in <module> import torch ModuleNotFoundError: No module named 'torch' The above exception was the direct cause of the following exception: Traceback (most recent call last): File "E:\Projects\haystack\test.py", line 9, in <module> document_store = InMemoryDocumentStore(use_bm25=True) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\base.py", line 46, in wrapper_exportable_to_yaml init_func(self, *args, **kwargs) File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\document_stores\memory.py", line 95, in __init__ torch_import.check() File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\lazy_imports\try_import.py", line 107, in check raise ImportError(message) from exc_value ImportError: Failed to import 'torch'. Run 'pip install farm-haystack[inference]'. 
Original error: No module named 'torch' (base) E:\Projects\haystack>pip install torch (base) E:\Projects\haystack>python test.py Traceback (most recent call last): File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\file_converter\pdf_xpdf.py", line 46, in __init__ subprocess.run(["pdftotext", "-v"], shell=False, check=False) File "C:\Users\xxxxxxxx\anaconda3\Lib\subprocess.py", line 548, in run with Popen(*popenargs, **kwargs) as process: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\subprocess.py", line 1026, in __init__ self._execute_child(args, executable, preexec_fn, close_fds, File "C:\Users\xxxxxxxx\anaconda3\Lib\subprocess.py", line 1538, in _execute_child hp, ht, pid, tid = _winapi.CreateProcess(executable, args, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ FileNotFoundError: [WinError 2] 指定されたファイルが見つかりません。 During handling of the above exception, another exception occurred: Traceback (most recent call last): File "E:\Projects\haystack\test.py", line 14, in <module> add_example_data(document_store, "D:\\Nextcloud\\Books") File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\utils\getting_started.py", line 77, in add_example_data docs = convert_files_to_docs(dir_path=dir) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\utils\preprocessing.py", line 69, in convert_files_to_docs suffix2converter[file_suffix] = PDFToTextConverter() ^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\base.py", line 46, in wrapper_exportable_to_yaml init_func(self, *args, **kwargs) File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\file_converter\pdf_xpdf.py", line 48, in __init__ raise FileNotFoundError( FileNotFoundError: pdftotext is not installed. It is part of xpdf or poppler-utils software suite. 
Installation on Linux: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin Installation on MacOS: brew install xpdf You can find more details here: https://www.xpdfreader.com https://rinsaka.com/python/tesseract/win02.html#py-install (base) E:\Projects\haystack>conda install -c conda-forge poppler (base) E:\Projects\haystack>pip install pdftotext The prompt has been truncated from 21062 tokens to 3988 tokens so that the prompt length and answer length (100 tokens) fit within the max token limit (4096 tokens). Reduce the length of the prompt to prevent it from being cut off. Traceback (most recent call last): File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\pipelines\base.py", line 567, in run node_output, stream_id = self._run_node(node_id, node_input) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\pipelines\base.py", line 469, in _run_node return self.graph.nodes[node_id]["component"]._dispatch_run(**node_input) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\base.py", line 201, in _dispatch_run return self._dispatch_run_general(self.run, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\base.py", line 245, in _dispatch_run_general output, stream = run_method(**run_inputs, **run_params) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\prompt\prompt_node.py", line 312, in run results = self(**invocation_context, prompt_collector=prompt_collector) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\prompt\prompt_node.py", line 140, in __call__ return self.prompt(prompt_template, 
*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\prompt\prompt_node.py", line 169, in prompt output = self.prompt_model.invoke(prompt, **kwargs_copy) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\prompt\prompt_model.py", line 129, in invoke output = self.model_invocation_layer.invoke(prompt=prompt, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\prompt\invocation_layer\chatgpt.py", line 196, in invoke response = openai_request(url=self.url, headers=self.headers, payload=payload, timeout=self.timeout) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\tenacity\__init__.py", line 289, in wrapped_f return self(f, *args, **kw) ^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\tenacity\__init__.py", line 379, in __call__ do = self.iter(retry_state=retry_state) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\tenacity\__init__.py", line 314, in iter return fut.result() ^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\concurrent\futures\_base.py", line 449, in result return self.__get_result() ^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\concurrent\futures\_base.py", line 401, in __get_result raise self._exception File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\tenacity\__init__.py", line 382, in __call__ result = fn(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\utils\openai_utils.py", line 149, in openai_request raise openai_error haystack.errors.OpenAIUnauthorizedError: API key is invalid: { "error": { "message": "Incorrect API key provided: sk-.... 
You can find your API key at https://platform.openai.com/account/api-keys.", "type": "invalid_request_error", "param": null, "code": "invalid_api_key" } } The above exception was the direct cause of the following exception: Traceback (most recent call last): File "E:\Projects\haystack\test.py", line 20, in <module> result = pipeline.run(query="What is PPU?") ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\pipelines\base.py", line 574, in run raise Exception( Exception: Exception while running node 'prompt_node': API key is invalid: { "error": { "message": "Incorrect API key provided: sk-.... You can find your API key at https://platform.openai.com/account/api-keys.", "type": "invalid_request_error", "param": null, "code": "invalid_api_key" } } Enable debug logging to see the data that was passed when the pipeline failed. https://haystack.deepset.ai/tutorials/01_basic_qa_pipeline (base) E:\Projects\haystack>python test2.py C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\transformers\utils\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. _torch_pytree._register_pytree_node( C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\transformers\utils\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. _torch_pytree._register_pytree_node( C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\transformers\utils\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. 
_torch_pytree._register_pytree_node( INFO - haystack.modeling.utils - Using devices: CPU - Number of GPUs: 0 INFO - haystack.utils.import_utils - Fetching from https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip to 'data/build_your_first_question_answering_system' INFO - haystack.pipelines.base - It seems that an indexing Pipeline is run, so using the nodes' run method instead of run_batch. Converting files: 100%|█████████████████████████████████████████████████████████████| 183/183 [00:01<00:00, 102.50it/s] Preprocessing: 0%| | 0/183 [00:00<?, ?docs/s]WARNING - haystack.nodes.preprocessor.preprocessor - We found one or more sentences whose split count is higher than the split length. Preprocessing: 50%|███████████████████████████████▋ | 92/183 [00:00<00:00, 231.54docs/s]WARNING - haystack.nodes.preprocessor.preprocessor - Document 4189b42892b3d941c035947d512b69dd is 12059 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively. Preprocessing: 80%|█████████████████████████████████████████████████▍ | 146/183 [00:00<00:00, 250.02docs/s]WARNING - haystack.nodes.preprocessor.preprocessor - Document dd048b8e5bcb7de1be5bd3937f15442f is 14232 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively. WARNING - haystack.nodes.preprocessor.preprocessor - Document b48fb0da693eb4d81b3566d0069868b3 is 10488 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. 
This document will be now hard-split at 10000 chars recursively. Preprocessing: 100%|██████████████████████████████████████████████████████████████| 183/183 [00:00<00:00, 258.99docs/s] Updating BM25 representation...: 100%|███████████████████████████████████████| 2359/2359 [00:00<00:00, 27915.40 docs/s] Traceback (most recent call last): File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\reader\farm.py", line 38, in <module> from haystack.modeling.training import Trainer, DistillationTrainer, TinyBERTDistillationTrainer File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\modeling\training\__init__.py", line 1, in <module> from haystack.modeling.training.base import Trainer, DistillationTrainer, TinyBERTDistillationTrainer File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\modeling\training\base.py", line 17, in <module> from haystack.modeling.evaluation.eval import Evaluator File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\modeling\evaluation\__init__.py", line 1, in <module> from haystack.modeling.evaluation.eval import Evaluator File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\modeling\evaluation\eval.py", line 10, in <module> from haystack.modeling.evaluation.metrics import compute_metrics, compute_report_metrics File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\modeling\evaluation\metrics.py", line 6, in <module> from sentence_transformers import CrossEncoder, SentenceTransformer ModuleNotFoundError: No module named 'sentence_transformers' The above exception was the direct cause of the following exception: Traceback (most recent call last): File "E:\Projects\haystack\test2.py", line 37, in <module> reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\base.py", line 46, in wrapper_exportable_to_yaml init_func(self, 
*args, **kwargs) File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\reader\farm.py", line 146, in __init__ torch_and_transformers_import.check() File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\lazy_imports\try_import.py", line 107, in check raise ImportError(message) from exc_value ImportError: Failed to import 'sentence_transformers'. Run 'pip install farm-haystack[inference]'. Original error: No module named 'sentence_transformers' (base) E:\Projects\haystack>pip install sentence_transformers (base) E:\Projects\haystack>python test2.py
ここで test2.py の内容は以下のとおり。
"""Extractive QA demo with Haystack 1.x (farm-haystack), saved as test2.py.

Downloads a small sample corpus of text files, indexes it into an in-memory
BM25 document store, and asks one question through a retriever + reader
pipeline, printing the answers.
"""
import logging
import os
from pprint import pprint

# Configure logging before importing haystack so that its INFO messages
# are emitted with this format.
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack.utils import fetch_archive_from_http, print_answers

document_store = InMemoryDocumentStore(use_bm25=True)

# Fetch the sample documents into doc_dir.
doc_dir = "data/build_your_first_question_answering_system"
fetch_archive_from_http(
    url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip",
    output_dir=doc_dir,
)

# Build the paths with os.path.join: the original `doc_dir + "\" + f` is a
# syntax error, because the lone backslash escapes the closing quote.
files_to_index = [os.path.join(doc_dir, f) for f in os.listdir(doc_dir)]
indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)

retriever = BM25Retriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
pipe = ExtractiveQAPipeline(reader, retriever)

prediction = pipe.run(
    query="Who is the father of Arya Stark?",
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
)

pprint(prediction)
print_answers(prediction, details="minimum")
これでサンプルは動いてくれたけれど、日本語 PDF を読み込ませて英語で質問してみても、ちゃんと答えを取れなかったようだ。