Skip to content

Commit 7523b1f

Browse files
author
Dongjin
committed
[MODIFY] filter for parent doc.
1 parent ddd9520 commit 7523b1f

File tree

130 files changed

+463
-929
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

130 files changed

+463
-929
lines changed

genai/aws-gen-ai-kr/20_applications/02_qa_chatbot/01_preprocess_docs/05_0_load_complex_pdf_kr_opensearch.ipynb

Lines changed: 6 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@
137137
},
138138
"outputs": [],
139139
"source": [
140-
"from langchain_community.chat_models import BedrockChat\n",
140+
"from langchain_aws import ChatBedrock\n",
141141
"from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler"
142142
]
143143
},
@@ -150,7 +150,7 @@
150150
},
151151
"outputs": [],
152152
"source": [
153-
"llm_text = BedrockChat(\n",
153+
"llm_text = ChatBedrock(\n",
154154
" model_id=bedrock_info.get_model_id(model_name=\"Claude-V3-Sonnet\"),\n",
155155
" client=boto3_bedrock,\n",
156156
" streaming=True,\n",
@@ -504,7 +504,7 @@
504504
},
505505
"outputs": [],
506506
"source": [
507-
"table_by_llama_parse = True"
507+
"table_by_llama_parse = False"
508508
]
509509
},
510510
{
@@ -517,7 +517,7 @@
517517
"outputs": [],
518518
"source": [
519519
"def api_key():\n",
520-
" os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-R4G3Pzu5IZIdq5AoAFILW1PPaVZxrVRN937R6f3cItBvPs1U\"\n",
520+
" os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"<your key>\"\n",
521521
" nest_asyncio.apply()\n",
522522
" load_dotenv()"
523523
]
@@ -617,7 +617,7 @@
617617
},
618618
"outputs": [],
619619
"source": [
620-
"table_by_pymupdf = True"
620+
"table_by_pymupdf = False"
621621
]
622622
},
623623
{
@@ -803,32 +803,6 @@
803803
"# tables_camleot[0].df.to_markdown()"
804804
]
805805
},
806-
{
807-
"cell_type": "code",
808-
"execution_count": null,
809-
"id": "825d1b1f-5208-4a1d-8ee9-971a493d7bfb",
810-
"metadata": {
811-
"tags": []
812-
},
813-
"outputs": [],
814-
"source": [
815-
"# for image in images:\n",
816-
" \n",
817-
"# img = cv2.imread(image) \n",
818-
"# width, height, _ = img.shape\n",
819-
"# image_token = width*height/750\n",
820-
"# print (f'image: {image}, shape: {img.shape}, image_token_for_claude3: {image_token}' )\n",
821-
" \n",
822-
"# if image_token > 1500:\n",
823-
"# resize_img = cv2.resize(img, (0, 0), fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)\n",
824-
"# print(\" - resize_img.shape = {0}\".format(resize_img.shape))\n",
825-
"# table_image_path = image.replace(\".jpg\", \"-resize.jpg\")\n",
826-
"# cv2.imwrite(table_image_path, resize_img)\n",
827-
"# os.remove(image)\n",
828-
" \n",
829-
"# images = glob(os.path.join(image_path, \"*\"))"
830-
]
831-
},
832806
{
833807
"cell_type": "markdown",
834808
"id": "fb16fd2a-1983-462d-94d8-4c62e81ef28c",
@@ -1102,18 +1076,6 @@
11021076
"summarize_chain = {\"table\": lambda x:x} | prompt | llm_text | StrOutputParser()"
11031077
]
11041078
},
1105-
{
1106-
"cell_type": "code",
1107-
"execution_count": null,
1108-
"id": "7a0bc0a5-7f5e-4bb8-9f0f-bbaf31622448",
1109-
"metadata": {
1110-
"tags": []
1111-
},
1112-
"outputs": [],
1113-
"source": [
1114-
"len(tables), len(docs_table_pymupdf), len(docs_table_llamaparse)"
1115-
]
1116-
},
11171079
{
11181080
"cell_type": "code",
11191081
"execution_count": null,
@@ -1223,18 +1185,6 @@
12231185
"#tables_preprocessed, images_preprocessed\n"
12241186
]
12251187
},
1226-
{
1227-
"cell_type": "code",
1228-
"execution_count": null,
1229-
"id": "d3504e83-d62e-40a9-9cdc-d5fbea2d7029",
1230-
"metadata": {
1231-
"tags": []
1232-
},
1233-
"outputs": [],
1234-
"source": [
1235-
"tables_preprocessed[0]"
1236-
]
1237-
},
12381188
{
12391189
"cell_type": "code",
12401190
"execution_count": null,
@@ -1314,7 +1264,7 @@
13141264
"outputs": [],
13151265
"source": [
13161266
"#index_name = \"kb_complex_doc\"\n",
1317-
"index_name = \"summit-workshop-index-unstructured-pymupdf-llama\" #summit-workshop-index-unstructured, #summit-workshop-index-unstructured-pymupdf"
1267+
"index_name = \"summit-workshop-index\" #summit-workshop-index-unstructured, #summit-workshop-index-unstructured-pymupdf, summit-workshop-index-unstructured-pymupdf-llama"
13181268
]
13191269
},
13201270
{

0 commit comments

Comments
 (0)