|
137 | 137 | },
|
138 | 138 | "outputs": [],
|
139 | 139 | "source": [
|
140 |
| - "from langchain_community.chat_models import BedrockChat\n", |
| 140 | + "from langchain_aws import ChatBedrock\n", |
141 | 141 | "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler"
|
142 | 142 | ]
|
143 | 143 | },
|
|
150 | 150 | },
|
151 | 151 | "outputs": [],
|
152 | 152 | "source": [
|
153 |
| - "llm_text = BedrockChat(\n", |
| 153 | + "llm_text = ChatBedrock(\n", |
154 | 154 | " model_id=bedrock_info.get_model_id(model_name=\"Claude-V3-Sonnet\"),\n",
|
155 | 155 | " client=boto3_bedrock,\n",
|
156 | 156 | " streaming=True,\n",
|
|
504 | 504 | },
|
505 | 505 | "outputs": [],
|
506 | 506 | "source": [
|
507 |
| - "table_by_llama_parse = True" |
| 507 | + "table_by_llama_parse = False" |
508 | 508 | ]
|
509 | 509 | },
|
510 | 510 | {
|
|
517 | 517 | "outputs": [],
|
518 | 518 | "source": [
|
519 | 519 | "def api_key():\n",
|
520 |
| - " os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-R4G3Pzu5IZIdq5AoAFILW1PPaVZxrVRN937R6f3cItBvPs1U\"\n", |
| 520 | + " os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"<your key>\"\n", |
521 | 521 | " nest_asyncio.apply()\n",
|
522 | 522 | " load_dotenv()"
|
523 | 523 | ]
|
|
617 | 617 | },
|
618 | 618 | "outputs": [],
|
619 | 619 | "source": [
|
620 |
| - "table_by_pymupdf = True" |
| 620 | + "table_by_pymupdf = False" |
621 | 621 | ]
|
622 | 622 | },
|
623 | 623 | {
|
|
803 | 803 | "# tables_camleot[0].df.to_markdown()"
|
804 | 804 | ]
|
805 | 805 | },
|
806 |
| - { |
807 |
| - "cell_type": "code", |
808 |
| - "execution_count": null, |
809 |
| - "id": "825d1b1f-5208-4a1d-8ee9-971a493d7bfb", |
810 |
| - "metadata": { |
811 |
| - "tags": [] |
812 |
| - }, |
813 |
| - "outputs": [], |
814 |
| - "source": [ |
815 |
| - "# for image in images:\n", |
816 |
| - " \n", |
817 |
| - "# img = cv2.imread(image) \n", |
818 |
| - "# width, height, _ = img.shape\n", |
819 |
| - "# image_token = width*height/750\n", |
820 |
| - "# print (f'image: {image}, shape: {img.shape}, image_token_for_claude3: {image_token}' )\n", |
821 |
| - " \n", |
822 |
| - "# if image_token > 1500:\n", |
823 |
| - "# resize_img = cv2.resize(img, (0, 0), fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)\n", |
824 |
| - "# print(\" - resize_img.shape = {0}\".format(resize_img.shape))\n", |
825 |
| - "# table_image_path = image.replace(\".jpg\", \"-resize.jpg\")\n", |
826 |
| - "# cv2.imwrite(table_image_path, resize_img)\n", |
827 |
| - "# os.remove(image)\n", |
828 |
| - " \n", |
829 |
| - "# images = glob(os.path.join(image_path, \"*\"))" |
830 |
| - ] |
831 |
| - }, |
832 | 806 | {
|
833 | 807 | "cell_type": "markdown",
|
834 | 808 | "id": "fb16fd2a-1983-462d-94d8-4c62e81ef28c",
|
|
1102 | 1076 | "summarize_chain = {\"table\": lambda x:x} | prompt | llm_text | StrOutputParser()"
|
1103 | 1077 | ]
|
1104 | 1078 | },
|
1105 |
| - { |
1106 |
| - "cell_type": "code", |
1107 |
| - "execution_count": null, |
1108 |
| - "id": "7a0bc0a5-7f5e-4bb8-9f0f-bbaf31622448", |
1109 |
| - "metadata": { |
1110 |
| - "tags": [] |
1111 |
| - }, |
1112 |
| - "outputs": [], |
1113 |
| - "source": [ |
1114 |
| - "len(tables), len(docs_table_pymupdf), len(docs_table_llamaparse)" |
1115 |
| - ] |
1116 |
| - }, |
1117 | 1079 | {
|
1118 | 1080 | "cell_type": "code",
|
1119 | 1081 | "execution_count": null,
|
|
1223 | 1185 | "#tables_preprocessed, images_preprocessed\n"
|
1224 | 1186 | ]
|
1225 | 1187 | },
|
1226 |
| - { |
1227 |
| - "cell_type": "code", |
1228 |
| - "execution_count": null, |
1229 |
| - "id": "d3504e83-d62e-40a9-9cdc-d5fbea2d7029", |
1230 |
| - "metadata": { |
1231 |
| - "tags": [] |
1232 |
| - }, |
1233 |
| - "outputs": [], |
1234 |
| - "source": [ |
1235 |
| - "tables_preprocessed[0]" |
1236 |
| - ] |
1237 |
| - }, |
1238 | 1188 | {
|
1239 | 1189 | "cell_type": "code",
|
1240 | 1190 | "execution_count": null,
|
|
1314 | 1264 | "outputs": [],
|
1315 | 1265 | "source": [
|
1316 | 1266 | "#index_name = \"kb_complex_doc\"\n",
|
1317 |
| - "index_name = \"summit-workshop-index-unstructured-pymupdf-llama\" #summit-workshop-index-unstructured, #summit-workshop-index-unstructured-pymupdf" |
| 1267 | + "index_name = \"summit-workshop-index\" #summit-workshop-index-unstructured, #summit-workshop-index-unstructured-pymupdf, summit-workshop-index-unstructured-pymupdf-llama" |
1318 | 1268 | ]
|
1319 | 1269 | },
|
1320 | 1270 | {
|
|
0 commit comments