(base) E:\Projects>mkdir haystack
(base) E:\Projects>cd haystack
(base) E:\Projects\haystack>pip install farm-haystack -f https://download.pytorch.org/whl/torch_stable.html
(base) E:\Projects\haystack>python test.py
File "E:\Projects\haystack\test.py", line 14
add_example_data(document_store, "D:\Nextcloud\Books")
^
SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: malformed \N character escape
(base) E:\Projects\haystack>python test.py
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Traceback (most recent call last):
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\document_stores\memory.py", line 29, in <module>
import torch
ModuleNotFoundError: No module named 'torch'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "E:\Projects\haystack\test.py", line 9, in <module>
document_store = InMemoryDocumentStore(use_bm25=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\base.py", line 46, in wrapper_exportable_to_yaml
init_func(self, *args, **kwargs)
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\document_stores\memory.py", line 95, in __init__
torch_import.check()
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\lazy_imports\try_import.py", line 107, in check
raise ImportError(message) from exc_value
ImportError: Failed to import 'torch'. Run 'pip install farm-haystack[inference]'. Original error: No module named 'torch'
(base) E:\Projects\haystack>pip install torch
(base) E:\Projects\haystack>python test.py
Traceback (most recent call last):
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\file_converter\pdf_xpdf.py", line 46, in __init__
subprocess.run(["pdftotext", "-v"], shell=False, check=False)
File "C:\Users\xxxxxxxx\anaconda3\Lib\subprocess.py", line 548, in run
with Popen(*popenargs, **kwargs) as process:
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\subprocess.py", line 1026, in __init__
self._execute_child(args, executable, preexec_fn, close_fds,
File "C:\Users\xxxxxxxx\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [WinError 2] 指定されたファイルが見つかりません。
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "E:\Projects\haystack\test.py", line 14, in <module>
add_example_data(document_store, "D:\\Nextcloud\\Books")
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\utils\getting_started.py", line 77, in add_example_data
docs = convert_files_to_docs(dir_path=dir)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\utils\preprocessing.py", line 69, in convert_files_to_docs
suffix2converter[file_suffix] = PDFToTextConverter()
^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\base.py", line 46, in wrapper_exportable_to_yaml
init_func(self, *args, **kwargs)
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\file_converter\pdf_xpdf.py", line 48, in __init__
raise FileNotFoundError(
FileNotFoundError: pdftotext is not installed. It is part of xpdf or poppler-utils software suite.
Installation on Linux:
wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz &&
tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
Installation on MacOS:
brew install xpdf
You can find more details here: https://www.xpdfreader.com
https://rinsaka.com/python/tesseract/win02.html#py-install
(base) E:\Projects\haystack>conda install -c conda-forge poppler
(base) E:\Projects\haystack>pip install pdftotext
The prompt has been truncated from 21062 tokens to 3988 tokens so that the prompt length and answer length (100 tokens) fit within the max token limit (4096 tokens). Reduce the length of the prompt to prevent it from being cut off.
Traceback (most recent call last):
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\pipelines\base.py", line 567, in run
node_output, stream_id = self._run_node(node_id, node_input)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\pipelines\base.py", line 469, in _run_node
return self.graph.nodes[node_id]["component"]._dispatch_run(**node_input)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\base.py", line 201, in _dispatch_run
return self._dispatch_run_general(self.run, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\base.py", line 245, in _dispatch_run_general
output, stream = run_method(**run_inputs, **run_params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\prompt\prompt_node.py", line 312, in run
results = self(**invocation_context, prompt_collector=prompt_collector)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\prompt\prompt_node.py", line 140, in __call__
return self.prompt(prompt_template, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\prompt\prompt_node.py", line 169, in prompt
output = self.prompt_model.invoke(prompt, **kwargs_copy)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\prompt\prompt_model.py", line 129, in invoke
output = self.model_invocation_layer.invoke(prompt=prompt, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\prompt\invocation_layer\chatgpt.py", line 196, in invoke
response = openai_request(url=self.url, headers=self.headers, payload=payload, timeout=self.timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\tenacity\__init__.py", line 289, in wrapped_f
return self(f, *args, **kw)
^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\tenacity\__init__.py", line 379, in __call__
do = self.iter(retry_state=retry_state)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\tenacity\__init__.py", line 314, in iter
return fut.result()
^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\concurrent\futures\_base.py", line 449, in result
return self.__get_result()
^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\concurrent\futures\_base.py", line 401, in __get_result
raise self._exception
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\tenacity\__init__.py", line 382, in __call__
result = fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\utils\openai_utils.py", line 149, in openai_request
raise openai_error
haystack.errors.OpenAIUnauthorizedError: API key is invalid: {
"error": {
"message": "Incorrect API key provided: sk-.... You can find your API key at https://platform.openai.com/account/api-keys.",
"type": "invalid_request_error",
"param": null,
"code": "invalid_api_key"
}
}
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "E:\Projects\haystack\test.py", line 20, in <module>
result = pipeline.run(query="What is PPU?")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\pipelines\base.py", line 574, in run
raise Exception(
Exception: Exception while running node 'prompt_node': API key is invalid: {
"error": {
"message": "Incorrect API key provided: sk-.... You can find your API key at https://platform.openai.com/account/api-keys.",
"type": "invalid_request_error",
"param": null,
"code": "invalid_api_key"
}
}
Enable debug logging to see the data that was passed when the pipeline failed.
https://haystack.deepset.ai/tutorials/01_basic_qa_pipeline
(base) E:\Projects\haystack>python test2.py
C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\transformers\utils\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
_torch_pytree._register_pytree_node(
C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\transformers\utils\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
_torch_pytree._register_pytree_node(
C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\transformers\utils\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
_torch_pytree._register_pytree_node(
INFO - haystack.modeling.utils - Using devices: CPU - Number of GPUs: 0
INFO - haystack.utils.import_utils - Fetching from https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip to 'data/build_your_first_question_answering_system'
INFO - haystack.pipelines.base - It seems that an indexing Pipeline is run, so using the nodes' run method instead of run_batch.
Converting files: 100%|█████████████████████████████████████████████████████████████| 183/183 [00:01<00:00, 102.50it/s]
Preprocessing: 0%| | 0/183 [00:00<?, ?docs/s]WARNING - haystack.nodes.preprocessor.preprocessor - We found one or more sentences whose split count is higher than the split length.
Preprocessing: 50%|███████████████████████████████▋ | 92/183 [00:00<00:00, 231.54docs/s]WARNING - haystack.nodes.preprocessor.preprocessor - Document 4189b42892b3d941c035947d512b69dd is 12059 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Preprocessing: 80%|█████████████████████████████████████████████████▍ | 146/183 [00:00<00:00, 250.02docs/s]WARNING - haystack.nodes.preprocessor.preprocessor - Document dd048b8e5bcb7de1be5bd3937f15442f is 14232 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
WARNING - haystack.nodes.preprocessor.preprocessor - Document b48fb0da693eb4d81b3566d0069868b3 is 10488 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Preprocessing: 100%|██████████████████████████████████████████████████████████████| 183/183 [00:00<00:00, 258.99docs/s]
Updating BM25 representation...: 100%|███████████████████████████████████████| 2359/2359 [00:00<00:00, 27915.40 docs/s]
Traceback (most recent call last):
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\reader\farm.py", line 38, in <module>
from haystack.modeling.training import Trainer, DistillationTrainer, TinyBERTDistillationTrainer
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\modeling\training\__init__.py", line 1, in <module>
from haystack.modeling.training.base import Trainer, DistillationTrainer, TinyBERTDistillationTrainer
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\modeling\training\base.py", line 17, in <module>
from haystack.modeling.evaluation.eval import Evaluator
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\modeling\evaluation\__init__.py", line 1, in <module>
from haystack.modeling.evaluation.eval import Evaluator
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\modeling\evaluation\eval.py", line 10, in <module>
from haystack.modeling.evaluation.metrics import compute_metrics, compute_report_metrics
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\modeling\evaluation\metrics.py", line 6, in <module>
from sentence_transformers import CrossEncoder, SentenceTransformer
ModuleNotFoundError: No module named 'sentence_transformers'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "E:\Projects\haystack\test2.py", line 37, in <module>
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\base.py", line 46, in wrapper_exportable_to_yaml
init_func(self, *args, **kwargs)
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\haystack\nodes\reader\farm.py", line 146, in __init__
torch_and_transformers_import.check()
File "C:\Users\xxxxxxxx\anaconda3\Lib\site-packages\lazy_imports\try_import.py", line 107, in check
raise ImportError(message) from exc_value
ImportError: Failed to import 'sentence_transformers'. Run 'pip install farm-haystack[inference]'. Original error: No module named 'sentence_transformers'
(base) E:\Projects\haystack>pip install sentence_transformers
(base) E:\Projects\haystack>python test2.py
ここで test2.py の内容は以下のとおり。
# Extractive question-answering example built on Haystack.
#
# Flow of the script:
#   1. Fetch the example archive into `doc_dir`.
#   2. Index every file listed in `doc_dir` into an in-memory document store
#      created with BM25 enabled.
#   3. Chain a BM25 retriever and a FARM reader into an extractive QA pipeline,
#      run one query against it, and print the prediction.

import logging
import os
from pprint import pprint

# Logging is configured before the haystack imports (same order as the
# original script): WARNING and above for every library, INFO for haystack.
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack.utils import fetch_archive_from_http, print_answers

document_store = InMemoryDocumentStore(use_bm25=True)

doc_dir = "data/build_your_first_question_answering_system"
fetch_archive_from_http(
    url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip",
    output_dir=doc_dir,
)

# Build the paths with os.path.join instead of concatenating a literal
# backslash: a lone "\" escapes the closing quote (SyntaxError), and a
# hard-coded separator would only be correct on Windows anyway.
files_to_index = [os.path.join(doc_dir, file_name) for file_name in os.listdir(doc_dir)]
indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)

retriever = BM25Retriever(document_store=document_store)
# use_gpu=False: the run log above reports "Using devices: CPU - Number of GPUs: 0".
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
pipe = ExtractiveQAPipeline(reader, retriever)

# Per-node parameters: the retriever is asked for 10 results and the reader
# for 5 (top_k values passed through `params`).
prediction = pipe.run(
    query="Who is the father of Arya Stark?",
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
)

pprint(prediction)
print_answers(prediction, details="minimum")
これでサンプルは動作したが、日本語の PDF を読み込ませて英語で質問してみても、期待した回答はうまく得られなかったようだ。