diff --git a/CHANGELOG.md b/CHANGELOG.md index 350df53d8..0996b5bb3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.5.4 + +### Enhancements + +* **Sharepoint support for nested folders and remove need for default path Shared Documents** + ## 0.5.3 ### Enhancements diff --git a/test/integration/connectors/expected_results/sharepoint/directory_structure.json b/test/integration/connectors/expected_results/sharepoint/directory_structure.json index a5f52ed4e..738b58633 100644 --- a/test/integration/connectors/expected_results/sharepoint/directory_structure.json +++ b/test/integration/connectors/expected_results/sharepoint/directory_structure.json @@ -1,8 +1,5 @@ { "directory_structure": [ - "Folder1/Folder2/fake-email.txt", - "Folder1/fake-memo.pdf", - "book-war-and-peace-1p.txt", - "list-item-example.pdf" + "fake-memo.pdf" ] } \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint/downloads/Folder1/fake-memo.pdf b/test/integration/connectors/expected_results/sharepoint/downloads/fake-memo.pdf similarity index 100% rename from test/integration/connectors/expected_results/sharepoint/downloads/Folder1/fake-memo.pdf rename to test/integration/connectors/expected_results/sharepoint/downloads/fake-memo.pdf diff --git a/test/integration/connectors/expected_results/sharepoint/file_data/0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ.json b/test/integration/connectors/expected_results/sharepoint/file_data/0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ.json deleted file mode 100644 index 4b86537b0..000000000 --- a/test/integration/connectors/expected_results/sharepoint/file_data/0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "identifier": "0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ", - "connector_type": "sharepoint", - "source_identifiers": { - "filename": "list-item-example.pdf", - "fullpath": "/list-item-example.pdf", - "rel_path": "list-item-example.pdf" - }, - "metadata": { - "url": "/drive/root:/list-item-example.pdf", - "version": "\"{94BC1801-87A2-4B52-A177-25324BB17AE9},1\"", - "record_locator": { - "user_pname": "devops@unstructuredio.onmicrosoft.com", - "server_relative_path": "/list-item-example.pdf" - }, - "date_created": "1738353995.0", - "date_modified": "1738353995.0", - "date_processed": "1738857995.149765", - "permissions_data": null, - "filesize_bytes": null - }, - "additional_metadata": { - "@microsoft.graph.downloadUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/_layouts/15/download.aspx?UniqueId=94bc1801-87a2-4b52-a177-25324bb17ae9&Translate=false&tempauth=v1.eyJzaXRlaWQiOiJhNmY1NjcwNS1hZjI5LTQ2YzctOTBiYS05YTBkNWE3YTFlZWMiLCJhcHBfZGlzcGxheW5hbWUiOiJzaGFyZXBvaW50LWFwcC1yZWdpc3RyYXRpb24iLCJhdWQiOiIwMDAwMDAwMy0wMDAwLTBmZjEtY2UwMC0wMDAwMDAwMDAwMDAvdW5zdHJ1Y3R1cmVkaW8uc2hhcmVwb2ludC5jb21AM2Q2MGE3ZTUtMWUzMi00MTRlLTgzOWItMWM2ZTY3ODI2MTNkIiwiZXhwIjoiMTczODg2MTU4OCJ9.CgoKBHNuaWQSAjY0EgsIztK48se04z0QBRoNNDAuMTI2LjIzLjE2Myosb1V4bW51M2Z1V1ZLTXdlZTVXUFNwSmJJWlNodDJheS9rTGFwTG5YdkJFcz0wnQE4AUIQoX7jAZEAAHDEwbtGLnBa9koQaGFzaGVkcHJvb2Z0b2tlbnoBMboBZXNoYXJlcG9pbnR0ZW5hbnRzZXR0aW5ncy5yZWFkd3JpdGUuYWxsIGFsbHNpdGVzLndyaXRlIGFsbHNpdGVzLm1hbmFnZSBhbGxmaWxlcy53cml0ZSBhbGxwcm9maWxlcy5yZWFkwgFJNmMxNjA3NTMtOWI2My00NzA5LWExNDAtMTdhMjdkMzA4N2E2QDNkNjBhN2U1LTFlMzItNDE0ZS04MzliLTFjNmU2NzgyNjEzZMgBAQ.C0ACRlYqk-wv1NYQStPJkYsqhWBliB4yPC3Bfvb7L9E&ApiVersion=2.0", - "eTag": "\"{94BC1801-87A2-4B52-A177-25324BB17AE9},1\"", - "id": "0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ", - "name": "list-item-example.pdf", - "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/list-item-example.pdf", - "cTag": "\"c:{94BC1801-87A2-4B52-A177-25324BB17AE9},1\"", - "size": 48981 - }, - "reprocess": false, - "local_download_path": "/private/var/folders/n8/rps3wl195pj4p_0vyxqj5jrw0000gn/T/tmpebidcrdb/list-item-example.pdf", - "display_name": null -} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint/file_data/0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG.json b/test/integration/connectors/expected_results/sharepoint/file_data/0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG.json deleted file mode 100644 index 984503855..000000000 --- a/test/integration/connectors/expected_results/sharepoint/file_data/0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "identifier": "0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG", - "connector_type": "sharepoint", - "source_identifiers": { - "filename": "book-war-and-peace-1p.txt", - "fullpath": "/book-war-and-peace-1p.txt", - "rel_path": "book-war-and-peace-1p.txt" - }, - "metadata": { - "url": "/drive/root:/book-war-and-peace-1p.txt", - "version": "\"{77650B15-A8AA-450E-BC8F-BB511A86D4C6},1\"", - "record_locator": { - "user_pname": "devops@unstructuredio.onmicrosoft.com", - "server_relative_path": "/book-war-and-peace-1p.txt" - }, - "date_created": "1738874878.0", - "date_modified": "1738874878.0", - "date_processed": "1738857990.276329", - "permissions_data": null, - "filesize_bytes": null - }, - "additional_metadata": { - "@microsoft.graph.downloadUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/_layouts/15/download.aspx?UniqueId=77650b15-a8aa-450e-bc8f-bb511a86d4c6&Translate=false&tempauth=v1.eyJzaXRlaWQiOiJhNmY1NjcwNS1hZjI5LTQ2YzctOTBiYS05YTBkNWE3YTFlZWMiLCJhcHBfZGlzcGxheW5hbWUiOiJzaGFyZXBvaW50LWFwcC1yZWdpc3RyYXRpb24iLCJhdWQiOiIwMDAwMDAwMy0wMDAwLTBmZjEtY2UwMC0wMDAwMDAwMDAwMDAvdW5zdHJ1Y3R1cmVkaW8uc2hhcmVwb2ludC5jb21AM2Q2MGE3ZTUtMWUzMi00MTRlLTgzOWItMWM2ZTY3ODI2MTNkIiwiZXhwIjoiMTczODg2MTU4OCJ9.CgoKBHNuaWQSAjY0EgsIztK48se04z0QBRoNNDAuMTI2LjIzLjE2MyosemVNQkkvcjlFQVVxeHg1QTZ0SVFmSjdFY0lUK2xvL25sNmhINVQ3U0ZsVT0wnQE4AUIQoX7jAZEAAHDEwbtGLnBa9koQaGFzaGVkcHJvb2Z0b2tlbnoBMboBZXNoYXJlcG9pbnR0ZW5hbnRzZXR0aW5ncy5yZWFkd3JpdGUuYWxsIGFsbHNpdGVzLndyaXRlIGFsbHNpdGVzLm1hbmFnZSBhbGxmaWxlcy53cml0ZSBhbGxwcm9maWxlcy5yZWFkwgFJNmMxNjA3NTMtOWI2My00NzA5LWExNDAtMTdhMjdkMzA4N2E2QDNkNjBhN2U1LTFlMzItNDE0ZS04MzliLTFjNmU2NzgyNjEzZMgBAQ.bHf13EoNeJwt9Mfhr7KHDpwbwLaKvoPdCAQdBH5fptY&ApiVersion=2.0", - "eTag": "\"{77650B15-A8AA-450E-BC8F-BB511A86D4C6},1\"", - "id": "0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG", - "name": "book-war-and-peace-1p.txt", - "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/book-war-and-peace-1p.txt", - "cTag": "\"c:{77650B15-A8AA-450E-BC8F-BB511A86D4C6},1\"", - "size": 3045 - }, - "reprocess": false, - "local_download_path": "/private/var/folders/n8/rps3wl195pj4p_0vyxqj5jrw0000gn/T/tmpebidcrdb/book-war-and-peace-1p.txt", - "display_name": null -} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint/file_data/0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL.json b/test/integration/connectors/expected_results/sharepoint/file_data/0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL.json deleted file mode 100644 index cbe4c0afe..000000000 --- a/test/integration/connectors/expected_results/sharepoint/file_data/0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "identifier": "0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL", - "connector_type": "sharepoint", - "source_identifiers": { - "filename": "fake-memo.pdf", - "fullpath": "Folder1/fake-memo.pdf", - "rel_path": "Folder1/fake-memo.pdf" - }, - "metadata": { - "url": "/drive/root:/Folder1/fake-memo.pdf", - "version": "\"{DACDC697-3D6C-4B30-8FCF-2A0635BF5C0B},1\"", - "record_locator": { - "user_pname": "devops@unstructuredio.onmicrosoft.com", - "server_relative_path": "Folder1/fake-memo.pdf" - }, - "date_created": "1738353979.0", - "date_modified": "1738353979.0", - "date_processed": "1738858000.113143", - "permissions_data": null, - "filesize_bytes": null - }, - "additional_metadata": { - "@microsoft.graph.downloadUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/_layouts/15/download.aspx?UniqueId=dacdc697-3d6c-4b30-8fcf-2a0635bf5c0b&Translate=false&tempauth=v1.eyJzaXRlaWQiOiJhNmY1NjcwNS1hZjI5LTQ2YzctOTBiYS05YTBkNWE3YTFlZWMiLCJhcHBfZGlzcGxheW5hbWUiOiJzaGFyZXBvaW50LWFwcC1yZWdpc3RyYXRpb24iLCJhdWQiOiIwMDAwMDAwMy0wMDAwLTBmZjEtY2UwMC0wMDAwMDAwMDAwMDAvdW5zdHJ1Y3R1cmVkaW8uc2hhcmVwb2ludC5jb21AM2Q2MGE3ZTUtMWUzMi00MTRlLTgzOWItMWM2ZTY3ODI2MTNkIiwiZXhwIjoiMTczODg2MTU4OSJ9.CgoKBHNuaWQSAjY0EgsIloOq-se04z0QBRoNNDAuMTI2LjIzLjE2MyosQWRxcWhoUmM1M0J4b0lSTGM1cUU4UG1aOGlxN1dVT1hKaktPMDFDR045az0wnQE4AUIQoX7jAcOwAHDEwbaxZjmiwUoQaGFzaGVkcHJvb2Z0b2tlbnoBMboBZXNoYXJlcG9pbnR0ZW5hbnRzZXR0aW5ncy5yZWFkd3JpdGUuYWxsIGFsbHNpdGVzLndyaXRlIGFsbHNpdGVzLm1hbmFnZSBhbGxmaWxlcy53cml0ZSBhbGxwcm9maWxlcy5yZWFkwgFJNmMxNjA3NTMtOWI2My00NzA5LWExNDAtMTdhMjdkMzA4N2E2QDNkNjBhN2U1LTFlMzItNDE0ZS04MzliLTFjNmU2NzgyNjEzZMgBAQ.Y8pJb9_A3MSsyY-dHe54lAFtsws3rUmnNYzaatqcLHk&ApiVersion=2.0", - "eTag": "\"{DACDC697-3D6C-4B30-8FCF-2A0635BF5C0B},1\"", - "id": "0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL", - "name": "fake-memo.pdf", - "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/Folder1/fake-memo.pdf", - "cTag": "\"c:{DACDC697-3D6C-4B30-8FCF-2A0635BF5C0B},1\"", - "size": 13374 - }, - "reprocess": false, - "local_download_path": "/private/var/folders/n8/rps3wl195pj4p_0vyxqj5jrw0000gn/T/tmpebidcrdb/Folder1/fake-memo.pdf", - "display_name": null -} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint/file_data/0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT.json b/test/integration/connectors/expected_results/sharepoint/file_data/0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT.json deleted file mode 100644 index 8907d8fcb..000000000 --- a/test/integration/connectors/expected_results/sharepoint/file_data/0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "identifier": "0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT", - "connector_type": "sharepoint", - "source_identifiers": { - "filename": "fake-email.txt", - "fullpath": "Folder1/Folder2/fake-email.txt", - "rel_path": "Folder1/Folder2/fake-email.txt" - }, - "metadata": { - "url": "/drive/root:/Folder1/Folder2/fake-email.txt", - "version": "\"{BBB9ADA5-A64E-4BEE-8ED9-7FC9F66A38D3},1\"", - "record_locator": { - "user_pname": "devops@unstructuredio.onmicrosoft.com", - "server_relative_path": "Folder1/Folder2/fake-email.txt" - }, - "date_created": "1738353577.0", - "date_modified": "1738353577.0", - "date_processed": "1738858004.884376", - "permissions_data": null, - "filesize_bytes": null - }, - "additional_metadata": { - "@microsoft.graph.downloadUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/_layouts/15/download.aspx?UniqueId=bbb9ada5-a64e-4bee-8ed9-7fc9f66a38d3&Translate=false&tempauth=v1.eyJzaXRlaWQiOiJhNmY1NjcwNS1hZjI5LTQ2YzctOTBiYS05YTBkNWE3YTFlZWMiLCJhcHBfZGlzcGxheW5hbWUiOiJzaGFyZXBvaW50LWFwcC1yZWdpc3RyYXRpb24iLCJhdWQiOiIwMDAwMDAwMy0wMDAwLTBmZjEtY2UwMC0wMDAwMDAwMDAwMDAvdW5zdHJ1Y3R1cmVkaW8uc2hhcmVwb2ludC5jb21AM2Q2MGE3ZTUtMWUzMi00MTRlLTgzOWItMWM2ZTY3ODI2MTNkIiwiZXhwIjoiMTczODg2MTU5MCJ9.CgoKBHNuaWQSAjY0EgsIwozog8i04z0QBRoNNDAuMTI2LjIzLjE2MyosUVo1V0pJa2twOU43RUdQMmRad0hKb0xWYjV5ajZhTzJGd1Fvd1lHYkdPMD0wnQE4AUIQoX7jAf_gAHDEwb6fitiweEoQaGFzaGVkcHJvb2Z0b2tlbnoBMboBZXNoYXJlcG9pbnR0ZW5hbnRzZXR0aW5ncy5yZWFkd3JpdGUuYWxsIGFsbHNpdGVzLndyaXRlIGFsbHNpdGVzLm1hbmFnZSBhbGxmaWxlcy53cml0ZSBhbGxwcm9maWxlcy5yZWFkwgFJNmMxNjA3NTMtOWI2My00NzA5LWExNDAtMTdhMjdkMzA4N2E2QDNkNjBhN2U1LTFlMzItNDE0ZS04MzliLTFjNmU2NzgyNjEzZMgBAQ.g5H4GW5a7RGxcRQuncDHONPF8uLL_kjE_5inH6KQYlY&ApiVersion=2.0", - "eTag": "\"{BBB9ADA5-A64E-4BEE-8ED9-7FC9F66A38D3},1\"", - "id": "0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT", - "name": "fake-email.txt", - "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/Folder1/Folder2/fake-email.txt", - "cTag": "\"c:{BBB9ADA5-A64E-4BEE-8ED9-7FC9F66A38D3},1\"", - "size": 836 - }, - "reprocess": false, - "local_download_path": "/private/var/folders/n8/rps3wl195pj4p_0vyxqj5jrw0000gn/T/tmpebidcrdb/Folder1/Folder2/fake-email.txt", - "display_name": null -} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint/file_data/01QKP26QZL5KBVQTQ3IRDYF72MRH2QKKR3.json b/test/integration/connectors/expected_results/sharepoint/file_data/01QKP26QZL5KBVQTQ3IRDYF72MRH2QKKR3.json new file mode 100644 index 000000000..e53d56059 --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint/file_data/01QKP26QZL5KBVQTQ3IRDYF72MRH2QKKR3.json @@ -0,0 +1,33 @@ +{ + "identifier": "01QKP26QZL5KBVQTQ3IRDYF72MRH2QKKR3", + "connector_type": "sharepoint", + "source_identifiers": { + "filename": "fake-memo.pdf", + "fullpath": "e2e-test-folder/fake-memo.pdf", + "rel_path": "fake-memo.pdf" + }, + "metadata": { + "url": "/drive/root:/e2e-test-folder/fake-memo.pdf", + "version": "\"{5883EA2B-1B4E-4744-82FF-4C89F5052A3B},1\"", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "e2e-test-folder/fake-memo.pdf" + }, + "date_created": "1738129296.0", + "date_modified": "1738129296.0", + "date_processed": "1739549929.526217", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "eTag": "\"{5883EA2B-1B4E-4744-82FF-4C89F5052A3B},1\"", + "id": "01QKP26QZL5KBVQTQ3IRDYF72MRH2QKKR3", + "name": "fake-memo.pdf", + "webUrl": "https://unstructuredio.sharepoint.com/Shared%20Documents/e2e-test-folder/fake-memo.pdf", + "cTag": "\"c:{5883EA2B-1B4E-4744-82FF-4C89F5052A3B},1\"", + "size": 13374 + }, + "reprocess": false, + "local_download_path": "/private/var/folders/85/f389rtdn2c971nv4r3d31d740000gn/T/tmpxs1_nhhe/fake-memo.pdf", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint1/directory_structure.json b/test/integration/connectors/expected_results/sharepoint1/directory_structure.json new file mode 100644 index 000000000..a5f52ed4e --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint1/directory_structure.json @@ -0,0 +1,8 @@ +{ + "directory_structure": [ + "Folder1/Folder2/fake-email.txt", + "Folder1/fake-memo.pdf", + "book-war-and-peace-1p.txt", + "list-item-example.pdf" + ] +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint/downloads/Folder1/Folder2/fake-email.txt b/test/integration/connectors/expected_results/sharepoint1/downloads/Folder1/Folder2/fake-email.txt similarity index 100% rename from test/integration/connectors/expected_results/sharepoint/downloads/Folder1/Folder2/fake-email.txt rename to test/integration/connectors/expected_results/sharepoint1/downloads/Folder1/Folder2/fake-email.txt diff --git a/test/integration/connectors/expected_results/sharepoint1/downloads/Folder1/fake-memo.pdf b/test/integration/connectors/expected_results/sharepoint1/downloads/Folder1/fake-memo.pdf new file mode 100644 index 000000000..98db73b23 Binary files /dev/null and b/test/integration/connectors/expected_results/sharepoint1/downloads/Folder1/fake-memo.pdf differ diff --git a/test/integration/connectors/expected_results/sharepoint/downloads/book-war-and-peace-1p.txt b/test/integration/connectors/expected_results/sharepoint1/downloads/book-war-and-peace-1p.txt similarity index 100% rename from test/integration/connectors/expected_results/sharepoint/downloads/book-war-and-peace-1p.txt rename to test/integration/connectors/expected_results/sharepoint1/downloads/book-war-and-peace-1p.txt diff --git a/test/integration/connectors/expected_results/sharepoint/downloads/list-item-example.pdf b/test/integration/connectors/expected_results/sharepoint1/downloads/list-item-example.pdf similarity index 100% rename from test/integration/connectors/expected_results/sharepoint/downloads/list-item-example.pdf rename to test/integration/connectors/expected_results/sharepoint1/downloads/list-item-example.pdf diff --git a/test/integration/connectors/expected_results/sharepoint1/file_data/0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ.json b/test/integration/connectors/expected_results/sharepoint1/file_data/0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ.json new file mode 100644 index 000000000..d102e9c27 --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint1/file_data/0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ.json @@ -0,0 +1,33 @@ +{ + "identifier": "0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ", + "connector_type": "sharepoint", + "source_identifiers": { + "filename": "list-item-example.pdf", + "fullpath": "/list-item-example.pdf", + "rel_path": "list-item-example.pdf" + }, + "metadata": { + "url": "/drive/root:/list-item-example.pdf", + "version": "\"{94BC1801-87A2-4B52-A177-25324BB17AE9},1\"", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "/list-item-example.pdf" + }, + "date_created": "1738364795.0", + "date_modified": "1738364795.0", + "date_processed": "1739552937.759603", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "eTag": "\"{94BC1801-87A2-4B52-A177-25324BB17AE9},1\"", + "id": "0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ", + "name": "list-item-example.pdf", + "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/list-item-example.pdf", + "cTag": "\"c:{94BC1801-87A2-4B52-A177-25324BB17AE9},1\"", + "size": 48981 + }, + "reprocess": false, + "local_download_path": "/private/var/folders/85/f389rtdn2c971nv4r3d31d740000gn/T/tmpf70nr_lt/list-item-example.pdf", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint1/file_data/0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG.json b/test/integration/connectors/expected_results/sharepoint1/file_data/0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG.json new file mode 100644 index 000000000..45f8d6c48 --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint1/file_data/0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG.json @@ -0,0 +1,33 @@ +{ + "identifier": "0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG", + "connector_type": "sharepoint", + "source_identifiers": { + "filename": "book-war-and-peace-1p.txt", + "fullpath": "/book-war-and-peace-1p.txt", + "rel_path": "book-war-and-peace-1p.txt" + }, + "metadata": { + "url": "/drive/root:/book-war-and-peace-1p.txt", + "version": "\"{77650B15-A8AA-450E-BC8F-BB511A86D4C6},1\"", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "/book-war-and-peace-1p.txt" + }, + "date_created": "1738885678.0", + "date_modified": "1738885678.0", + "date_processed": "1739552932.2096388", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "eTag": "\"{77650B15-A8AA-450E-BC8F-BB511A86D4C6},1\"", + "id": "0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG", + "name": "book-war-and-peace-1p.txt", + "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/book-war-and-peace-1p.txt", + "cTag": "\"c:{77650B15-A8AA-450E-BC8F-BB511A86D4C6},1\"", + "size": 3045 + }, + "reprocess": false, + "local_download_path": "/private/var/folders/85/f389rtdn2c971nv4r3d31d740000gn/T/tmpf70nr_lt/book-war-and-peace-1p.txt", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint1/file_data/0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL.json b/test/integration/connectors/expected_results/sharepoint1/file_data/0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL.json new file mode 100644 index 000000000..8565f66d9 --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint1/file_data/0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL.json @@ -0,0 +1,33 @@ +{ + "identifier": "0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL", + "connector_type": "sharepoint", + "source_identifiers": { + "filename": "fake-memo.pdf", + "fullpath": "Folder1/fake-memo.pdf", + "rel_path": "Folder1/fake-memo.pdf" + }, + "metadata": { + "url": "/drive/root:/Folder1/fake-memo.pdf", + "version": "\"{DACDC697-3D6C-4B30-8FCF-2A0635BF5C0B},1\"", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "Folder1/fake-memo.pdf" + }, + "date_created": "1738364779.0", + "date_modified": "1738364779.0", + "date_processed": "1739552943.296567", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "eTag": "\"{DACDC697-3D6C-4B30-8FCF-2A0635BF5C0B},1\"", + "id": "0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL", + "name": "fake-memo.pdf", + "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/Folder1/fake-memo.pdf", + "cTag": "\"c:{DACDC697-3D6C-4B30-8FCF-2A0635BF5C0B},1\"", + "size": 13374 + }, + "reprocess": false, + "local_download_path": "/private/var/folders/85/f389rtdn2c971nv4r3d31d740000gn/T/tmpf70nr_lt/Folder1/fake-memo.pdf", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint1/file_data/0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT.json b/test/integration/connectors/expected_results/sharepoint1/file_data/0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT.json new file mode 100644 index 000000000..58bbca9ba --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint1/file_data/0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT.json @@ -0,0 +1,33 @@ +{ + "identifier": "0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT", + "connector_type": "sharepoint", + "source_identifiers": { + "filename": "fake-email.txt", + "fullpath": "Folder1/Folder2/fake-email.txt", + "rel_path": "Folder1/Folder2/fake-email.txt" + }, + "metadata": { + "url": "/drive/root:/Folder1/Folder2/fake-email.txt", + "version": "\"{BBB9ADA5-A64E-4BEE-8ED9-7FC9F66A38D3},1\"", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "Folder1/Folder2/fake-email.txt" + }, + "date_created": "1738364377.0", + "date_modified": "1738364377.0", + "date_processed": "1739552950.261651", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "eTag": "\"{BBB9ADA5-A64E-4BEE-8ED9-7FC9F66A38D3},1\"", + "id": "0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT", + "name": "fake-email.txt", + "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/Folder1/Folder2/fake-email.txt", + "cTag": "\"c:{BBB9ADA5-A64E-4BEE-8ED9-7FC9F66A38D3},1\"", + "size": 836 + }, + "reprocess": false, + "local_download_path": "/private/var/folders/85/f389rtdn2c971nv4r3d31d740000gn/T/tmpf70nr_lt/Folder1/Folder2/fake-email.txt", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint2/directory_structure.json b/test/integration/connectors/expected_results/sharepoint2/directory_structure.json new file mode 100644 index 000000000..38a39737f --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint2/directory_structure.json @@ -0,0 +1,6 @@ +{ + "directory_structure": [ + "Folder2/fake-email.txt", + "fake-memo.pdf" + ] +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint2/downloads/Folder2/fake-email.txt b/test/integration/connectors/expected_results/sharepoint2/downloads/Folder2/fake-email.txt new file mode 100644 index 000000000..044766024 --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint2/downloads/Folder2/fake-email.txt @@ -0,0 +1,25 @@ +MIME-Version: 1.0 +Date: Fri, 16 Dec 2022 17:04:16 -0500 +Message-ID: +Subject: Test Email +From: Matthew Robinson +To: Matthew Robinson +Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" + +--00000000000095c9b205eff92630 +Content-Type: text/plain; charset="UTF-8" + +This is a test email to use for unit tests. + +Important points: + + - Roses are red + - Violets are blue + - + +--00000000000095c9b205eff92630 +Content-Type: text/html; charset="UTF-8" + +
This is a test email to use for unit tests.

Important points:
  • Roses are red
  • Violets are blue
+ +--00000000000095c9b205eff92630-- \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint2/downloads/fake-memo.pdf b/test/integration/connectors/expected_results/sharepoint2/downloads/fake-memo.pdf new file mode 100644 index 000000000..98db73b23 Binary files /dev/null and b/test/integration/connectors/expected_results/sharepoint2/downloads/fake-memo.pdf differ diff --git a/test/integration/connectors/expected_results/sharepoint2/file_data/0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL.json b/test/integration/connectors/expected_results/sharepoint2/file_data/0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL.json new file mode 100644 index 000000000..efd2b7321 --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint2/file_data/0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL.json @@ -0,0 +1,33 @@ +{ + "identifier": "0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL", + "connector_type": "sharepoint", + "source_identifiers": { + "filename": "fake-memo.pdf", + "fullpath": "Folder1/fake-memo.pdf", + "rel_path": "fake-memo.pdf" + }, + "metadata": { + "url": "/drive/root:/Folder1/fake-memo.pdf", + "version": "\"{DACDC697-3D6C-4B30-8FCF-2A0635BF5C0B},1\"", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "Folder1/fake-memo.pdf" + }, + "date_created": "1738364779.0", + "date_modified": "1738364779.0", + "date_processed": "1739552963.206111", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "eTag": "\"{DACDC697-3D6C-4B30-8FCF-2A0635BF5C0B},1\"", + "id": "0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL", + "name": "fake-memo.pdf", + "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/Folder1/fake-memo.pdf", + "cTag": "\"c:{DACDC697-3D6C-4B30-8FCF-2A0635BF5C0B},1\"", + "size": 13374 + }, + "reprocess": false, + "local_download_path": "/private/var/folders/85/f389rtdn2c971nv4r3d31d740000gn/T/tmpjuthxo3q/fake-memo.pdf", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint2/file_data/0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT.json b/test/integration/connectors/expected_results/sharepoint2/file_data/0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT.json new file mode 100644 index 000000000..c2a31af3a --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint2/file_data/0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT.json @@ -0,0 +1,33 @@ +{ + "identifier": "0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT", + "connector_type": "sharepoint", + "source_identifiers": { + "filename": "fake-email.txt", + "fullpath": "Folder1/Folder2/fake-email.txt", + "rel_path": "Folder2/fake-email.txt" + }, + "metadata": { + "url": "/drive/root:/Folder1/Folder2/fake-email.txt", + "version": "\"{BBB9ADA5-A64E-4BEE-8ED9-7FC9F66A38D3},1\"", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "Folder1/Folder2/fake-email.txt" + }, + "date_created": "1738364377.0", + "date_modified": "1738364377.0", + "date_processed": "1739552968.488659", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "eTag": "\"{BBB9ADA5-A64E-4BEE-8ED9-7FC9F66A38D3},1\"", + "id": "0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT", + "name": "fake-email.txt", + "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/Folder1/Folder2/fake-email.txt", + "cTag": "\"c:{BBB9ADA5-A64E-4BEE-8ED9-7FC9F66A38D3},1\"", + "size": 836 + }, + "reprocess": false, + "local_download_path": "/private/var/folders/85/f389rtdn2c971nv4r3d31d740000gn/T/tmpjuthxo3q/Folder2/fake-email.txt", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint3/directory_structure.json b/test/integration/connectors/expected_results/sharepoint3/directory_structure.json new file mode 100644 index 000000000..738b58633 --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint3/directory_structure.json @@ -0,0 +1,5 @@ +{ + "directory_structure": [ + "fake-memo.pdf" + ] +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint3/downloads/fake-memo.pdf b/test/integration/connectors/expected_results/sharepoint3/downloads/fake-memo.pdf new file mode 100644 index 000000000..98db73b23 Binary files /dev/null and b/test/integration/connectors/expected_results/sharepoint3/downloads/fake-memo.pdf differ diff --git a/test/integration/connectors/expected_results/sharepoint3/file_data/01QKP26QZL5KBVQTQ3IRDYF72MRH2QKKR3.json b/test/integration/connectors/expected_results/sharepoint3/file_data/01QKP26QZL5KBVQTQ3IRDYF72MRH2QKKR3.json new file mode 100644 index 000000000..9b743bd24 --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint3/file_data/01QKP26QZL5KBVQTQ3IRDYF72MRH2QKKR3.json @@ -0,0 +1,33 @@ +{ + "identifier": "01QKP26QZL5KBVQTQ3IRDYF72MRH2QKKR3", + "connector_type": "sharepoint", + "source_identifiers": { + "filename": "fake-memo.pdf", + "fullpath": "e2e-test-folder/fake-memo.pdf", + "rel_path": "fake-memo.pdf" + }, + "metadata": { + "url": "/drive/root:/e2e-test-folder/fake-memo.pdf", + "version": "\"{5883EA2B-1B4E-4744-82FF-4C89F5052A3B},1\"", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "e2e-test-folder/fake-memo.pdf" + }, + "date_created": "1738129296.0", + "date_modified": "1738129296.0", + "date_processed": "1739552981.163512", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "eTag": "\"{5883EA2B-1B4E-4744-82FF-4C89F5052A3B},1\"", + "id": "01QKP26QZL5KBVQTQ3IRDYF72MRH2QKKR3", + "name": "fake-memo.pdf", + "webUrl": "https://unstructuredio.sharepoint.com/Shared%20Documents/e2e-test-folder/fake-memo.pdf", + "cTag": "\"c:{5883EA2B-1B4E-4744-82FF-4C89F5052A3B},1\"", + "size": 13374 + }, + "reprocess": false, + "local_download_path": "/private/var/folders/85/f389rtdn2c971nv4r3d31d740000gn/T/tmp6wq9s91s/fake-memo.pdf", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint4/directory_structure.json b/test/integration/connectors/expected_results/sharepoint4/directory_structure.json new file mode 100644 index 000000000..a5f52ed4e --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint4/directory_structure.json @@ -0,0 +1,8 @@ +{ + "directory_structure": [ + "Folder1/Folder2/fake-email.txt", + "Folder1/fake-memo.pdf", + "book-war-and-peace-1p.txt", + "list-item-example.pdf" + ] +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint4/downloads/Folder1/Folder2/fake-email.txt b/test/integration/connectors/expected_results/sharepoint4/downloads/Folder1/Folder2/fake-email.txt new file mode 100644 index 000000000..044766024 --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint4/downloads/Folder1/Folder2/fake-email.txt @@ -0,0 +1,25 @@ +MIME-Version: 1.0 +Date: Fri, 16 Dec 2022 17:04:16 -0500 +Message-ID: +Subject: Test Email +From: Matthew Robinson +To: Matthew Robinson +Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" + +--00000000000095c9b205eff92630 +Content-Type: text/plain; charset="UTF-8" + +This is a test email to use for unit tests. + +Important points: + + - Roses are red + - Violets are blue + - + +--00000000000095c9b205eff92630 +Content-Type: text/html; charset="UTF-8" + +
This is a test email to use for unit tests.

Important points:
  • Roses are red
  • Violets are blue
+ +--00000000000095c9b205eff92630-- \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint4/downloads/Folder1/fake-memo.pdf b/test/integration/connectors/expected_results/sharepoint4/downloads/Folder1/fake-memo.pdf new file mode 100644 index 000000000..98db73b23 Binary files /dev/null and b/test/integration/connectors/expected_results/sharepoint4/downloads/Folder1/fake-memo.pdf differ diff --git a/test/integration/connectors/expected_results/sharepoint4/downloads/book-war-and-peace-1p.txt b/test/integration/connectors/expected_results/sharepoint4/downloads/book-war-and-peace-1p.txt new file mode 100644 index 000000000..962d915d2 --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint4/downloads/book-war-and-peace-1p.txt @@ -0,0 +1,62 @@ +CHAPTER I + +"Well, Prince, so Genoa and Lucca are now just family estates of the +Buonapartes. But I warn you, if you don't tell me that this means war, +if you still try to defend the infamies and horrors perpetrated by that +Antichrist--I really believe he is Antichrist--I will have nothing more +to do with you and you are no longer my friend, no longer my 'faithful +slave,' as you call yourself! But how do you do? I see I have frightened +you--sit down and tell me all the news." + +It was in July, 1805, and the speaker was the well-known Anna Pavlovna +Scherer, maid of honor and favorite of the Empress Marya Fedorovna. With +these words she greeted Prince Vasili Kuragin, a man of high rank and +importance, who was the first to arrive at her reception. Anna Pavlovna +had had a cough for some days. She was, as she said, suffering from la +grippe; grippe being then a new word in St. Petersburg, used only by the +elite. + +All her invitations without exception, written in French, and delivered +by a scarlet-liveried footman that morning, ran as follows: + +"If you have nothing better to do, Count (or Prince), and if the +prospect of spending an evening with a poor invalid is not too terrible, +I shall be very charmed to see you tonight between 7 and 10--Annette +Scherer." + +"Heavens! what a virulent attack!" replied the prince, not in the least +disconcerted by this reception. He had just entered, wearing an +embroidered court uniform, knee breeches, and shoes, and had stars on +his breast and a serene expression on his flat face. He spoke in that +refined French in which our grandfathers not only spoke but thought, and +with the gentle, patronizing intonation natural to a man of importance +who had grown old in society and at court. He went up to Anna Pavlovna, +kissed her hand, presenting to her his bald, scented, and shining head, +and complacently seated himself on the sofa. + +"First of all, dear friend, tell me how you are. Set your friend's mind +at rest," said he without altering his tone, beneath the politeness and +affected sympathy of which indifference and even irony could be +discerned. + +"Can one be well while suffering morally? Can one be calm in times like +these if one has any feeling?" said Anna Pavlovna. "You are staying the +whole evening, I hope?" + +"And the fete at the English ambassador's? Today is Wednesday. I must +put in an appearance there," said the prince. "My daughter is coming for +me to take me there." + +"I thought today's fete had been canceled. I confess all these +festivities and fireworks are becoming wearisome." + +"If they had known that you wished it, the entertainment would have been +put off," said the prince, who, like a wound-up clock, by force of habit +said things he did not even wish to be believed. + +"Don't tease! Well, and what has been decided about Novosiltsev's +dispatch? You know everything." + +"What can one say about it?" replied the prince in a cold, listless +tone. "What has been decided? They have decided that Buonaparte has +burnt his boats, and I believe that we are ready to burn ours." diff --git a/test/integration/connectors/expected_results/sharepoint4/downloads/list-item-example.pdf b/test/integration/connectors/expected_results/sharepoint4/downloads/list-item-example.pdf new file mode 100644 index 000000000..c540662d4 Binary files /dev/null and b/test/integration/connectors/expected_results/sharepoint4/downloads/list-item-example.pdf differ diff --git a/test/integration/connectors/expected_results/sharepoint4/file_data/0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ.json b/test/integration/connectors/expected_results/sharepoint4/file_data/0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ.json new file mode 100644 index 000000000..49de32213 --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint4/file_data/0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ.json @@ -0,0 +1,33 @@ +{ + "identifier": "0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ", + "connector_type": "sharepoint", + "source_identifiers": { + "filename": "list-item-example.pdf", + "fullpath": "/list-item-example.pdf", + "rel_path": "list-item-example.pdf" + }, + "metadata": { + "url": "/drive/root:/list-item-example.pdf", + "version": "\"{94BC1801-87A2-4B52-A177-25324BB17AE9},1\"", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "/list-item-example.pdf" + }, + "date_created": "1738364795.0", + "date_modified": "1738364795.0", + "date_processed": "1739552999.147008", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "eTag": "\"{94BC1801-87A2-4B52-A177-25324BB17AE9},1\"", + "id": "0153RHRSABDC6JJIUHKJF2C5ZFGJF3C6XJ", + "name": "list-item-example.pdf", + "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/list-item-example.pdf", + "cTag": "\"c:{94BC1801-87A2-4B52-A177-25324BB17AE9},1\"", + "size": 48981 + }, + "reprocess": false, + "local_download_path": "/private/var/folders/85/f389rtdn2c971nv4r3d31d740000gn/T/tmpvcr4z9x8/list-item-example.pdf", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint4/file_data/0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG.json b/test/integration/connectors/expected_results/sharepoint4/file_data/0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG.json new file mode 100644 index 000000000..e8d192b0d --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint4/file_data/0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG.json @@ -0,0 +1,33 @@ +{ + "identifier": "0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG", + "connector_type": "sharepoint", + "source_identifiers": { + "filename": "book-war-and-peace-1p.txt", + "fullpath": "/book-war-and-peace-1p.txt", + "rel_path": "book-war-and-peace-1p.txt" + }, + "metadata": { + "url": "/drive/root:/book-war-and-peace-1p.txt", + "version": "\"{77650B15-A8AA-450E-BC8F-BB511A86D4C6},1\"", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "/book-war-and-peace-1p.txt" + }, + "date_created": "1738885678.0", + "date_modified": "1738885678.0", + "date_processed": "1739552993.9836009", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "eTag": "\"{77650B15-A8AA-450E-BC8F-BB511A86D4C6},1\"", + "id": "0153RHRSAVBNSXPKVIBZC3ZD53KENINVGG", + "name": "book-war-and-peace-1p.txt", + "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/book-war-and-peace-1p.txt", + "cTag": "\"c:{77650B15-A8AA-450E-BC8F-BB511A86D4C6},1\"", + "size": 3045 + }, + "reprocess": false, + "local_download_path": "/private/var/folders/85/f389rtdn2c971nv4r3d31d740000gn/T/tmpvcr4z9x8/book-war-and-peace-1p.txt", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint4/file_data/0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL.json b/test/integration/connectors/expected_results/sharepoint4/file_data/0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL.json new file mode 100644 index 000000000..490e642d2 --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint4/file_data/0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL.json @@ -0,0 +1,33 @@ +{ + "identifier": "0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL", + "connector_type": "sharepoint", + "source_identifiers": { + "filename": "fake-memo.pdf", + "fullpath": "Folder1/fake-memo.pdf", + "rel_path": "Folder1/fake-memo.pdf" + }, + "metadata": { + "url": "/drive/root:/Folder1/fake-memo.pdf", + "version": "\"{DACDC697-3D6C-4B30-8FCF-2A0635BF5C0B},1\"", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "Folder1/fake-memo.pdf" + }, + "date_created": "1738364779.0", + "date_modified": "1738364779.0", + "date_processed": "1739553004.550522", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "eTag": "\"{DACDC697-3D6C-4B30-8FCF-2A0635BF5C0B},1\"", + "id": "0153RHRSEXY3G5U3B5GBFY7TZKAY236XAL", + "name": "fake-memo.pdf", + "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/Folder1/fake-memo.pdf", + "cTag": "\"c:{DACDC697-3D6C-4B30-8FCF-2A0635BF5C0B},1\"", + "size": 13374 + }, + "reprocess": false, + "local_download_path": "/private/var/folders/85/f389rtdn2c971nv4r3d31d740000gn/T/tmpvcr4z9x8/Folder1/fake-memo.pdf", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/sharepoint4/file_data/0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT.json b/test/integration/connectors/expected_results/sharepoint4/file_data/0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT.json new file mode 100644 index 000000000..57935d73b --- /dev/null +++ b/test/integration/connectors/expected_results/sharepoint4/file_data/0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT.json @@ -0,0 +1,33 @@ +{ + "identifier": "0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT", + "connector_type": "sharepoint", + "source_identifiers": { + "filename": "fake-email.txt", + "fullpath": "Folder1/Folder2/fake-email.txt", + "rel_path": "Folder1/Folder2/fake-email.txt" + }, + "metadata": { + "url": "/drive/root:/Folder1/Folder2/fake-email.txt", + "version": "\"{BBB9ADA5-A64E-4BEE-8ED9-7FC9F66A38D3},1\"", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "Folder1/Folder2/fake-email.txt" + }, + "date_created": "1738364377.0", + "date_modified": "1738364377.0", + "date_processed": "1739553009.889132", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "eTag": "\"{BBB9ADA5-A64E-4BEE-8ED9-7FC9F66A38D3},1\"", + "id": "0153RHRSFFVW43WTVG5ZFY5WL7ZH3GUOGT", + "name": "fake-email.txt", + "webUrl": "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source/Shared%20Documents/Folder1/Folder2/fake-email.txt", + "cTag": "\"c:{BBB9ADA5-A64E-4BEE-8ED9-7FC9F66A38D3},1\"", + "size": 836 + }, + "reprocess": false, + "local_download_path": "/private/var/folders/85/f389rtdn2c971nv4r3d31d740000gn/T/tmpvcr4z9x8/Folder1/Folder2/fake-email.txt", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/test_sharepoint.py b/test/integration/connectors/test_sharepoint.py index 23428c4e8..b8da39403 100644 --- a/test/integration/connectors/test_sharepoint.py +++ b/test/integration/connectors/test_sharepoint.py @@ -19,24 +19,31 @@ ) +def sharepoint_config(): + class SharepointTestConfig: + def __init__(self): + self.client_id = os.environ["SHAREPOINT_CLIENT_ID"] + self.client_cred = os.environ["SHAREPOINT_CRED"] + self.user_pname = os.environ["MS_USER_PNAME"] + self.tenant = os.environ["MS_TENANT_ID"] + + return SharepointTestConfig() + + @pytest.mark.asyncio @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG) @requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME") async def test_sharepoint_source(temp_dir): - # Retrieve environment variables site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source" - client_id = os.environ["SHAREPOINT_CLIENT_ID"] - client_cred = os.environ["SHAREPOINT_CRED"] - user_pname = os.environ["MS_USER_PNAME"] - tenant = os.environ["MS_TENANT_ID"] + config = sharepoint_config() # Create connection and indexer configurations - access_config = SharepointAccessConfig(client_cred=client_cred) + access_config = SharepointAccessConfig(client_cred=config.client_cred) connection_config = SharepointConnectionConfig( - client_id=client_id, + client_id=config.client_id, site=site, - tenant=tenant, - user_pname=user_pname, + tenant=config.tenant, + user_pname=config.user_pname, access_config=access_config, ) index_config = SharepointIndexerConfig(recursive=True) @@ -58,7 +65,151 @@ async def test_sharepoint_source(temp_dir): indexer=indexer, downloader=downloader, configs=SourceValidationConfigs( - test_id="sharepoint", + test_id="sharepoint1", + expected_num_files=4, + validate_downloaded_files=True, + exclude_fields_extend=[ + "metadata.date_created", + "metadata.date_modified", + "additional_metadata.LastModified", + "additional_metadata.@microsoft.graph.downloadUrl", + ], + ), + ) + + +@pytest.mark.asyncio +@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG) +@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME") +async def test_sharepoint_source_with_path(temp_dir): + site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source" + config = sharepoint_config() + + # Create connection and indexer configurations + access_config = SharepointAccessConfig(client_cred=config.client_cred) + connection_config = SharepointConnectionConfig( + client_id=config.client_id, + site=site, + tenant=config.tenant, + user_pname=config.user_pname, + access_config=access_config, + ) + index_config = SharepointIndexerConfig(recursive=True, path="Folder1") + + download_config = SharepointDownloaderConfig(download_dir=temp_dir) + + # Instantiate indexer and downloader + indexer = SharepointIndexer( + connection_config=connection_config, + index_config=index_config, + ) + downloader = SharepointDownloader( + connection_config=connection_config, + download_config=download_config, + ) + + # Run the source connector validation + await source_connector_validation( + indexer=indexer, + downloader=downloader, + configs=SourceValidationConfigs( + test_id="sharepoint2", + expected_num_files=2, + validate_downloaded_files=True, + exclude_fields_extend=[ + "metadata.date_created", + "metadata.date_modified", + "additional_metadata.LastModified", + "additional_metadata.@microsoft.graph.downloadUrl", + ], + ), + ) + + +@pytest.mark.asyncio +@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG) +@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME") +async def test_sharepoint_root_with_path(temp_dir): + site = "https://unstructuredio.sharepoint.com/" + config = sharepoint_config() + + # Create connection and indexer configurations + access_config = SharepointAccessConfig(client_cred=config.client_cred) + connection_config = SharepointConnectionConfig( + client_id=config.client_id, + site=site, + tenant=config.tenant, + user_pname=config.user_pname, + access_config=access_config, + ) + index_config = SharepointIndexerConfig(recursive=True, path="e2e-test-folder") + + download_config = SharepointDownloaderConfig(download_dir=temp_dir) + + # Instantiate indexer and downloader + indexer = SharepointIndexer( + connection_config=connection_config, + index_config=index_config, + ) + downloader = SharepointDownloader( + connection_config=connection_config, + download_config=download_config, + ) + + # Run the source connector validation + await source_connector_validation( + indexer=indexer, + downloader=downloader, + configs=SourceValidationConfigs( + test_id="sharepoint3", + expected_num_files=1, + validate_downloaded_files=True, + exclude_fields_extend=[ + "metadata.date_created", + "metadata.date_modified", + "additional_metadata.LastModified", + "additional_metadata.@microsoft.graph.downloadUrl", + ], + ), + ) + + +@pytest.mark.asyncio +@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG) +@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME") +async def test_sharepoint_shared_documents(temp_dir): + site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source" + config = sharepoint_config() + + # Create connection and indexer configurations + access_config = SharepointAccessConfig(client_cred=config.client_cred) + connection_config = SharepointConnectionConfig( + client_id=config.client_id, + site=site, + tenant=config.tenant, + user_pname=config.user_pname, + access_config=access_config, + ) + index_config = SharepointIndexerConfig(recursive=True, path="Shared Documents") + + download_config = SharepointDownloaderConfig(download_dir=temp_dir) + + # Instantiate indexer and downloader + indexer = SharepointIndexer( + connection_config=connection_config, + index_config=index_config, + ) + downloader = SharepointDownloader( + connection_config=connection_config, + download_config=download_config, + ) + + # Run the source connector validation + await source_connector_validation( + indexer=indexer, + downloader=downloader, + configs=SourceValidationConfigs( + test_id="sharepoint4", expected_num_files=4, validate_downloaded_files=True, exclude_fields_extend=[ diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index fa21553d1..51579011b 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.5.3" # pragma: no cover +__version__ = "0.5.4" # pragma: no cover diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index e0ad958f9..c6586dc6e 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -31,6 +31,7 @@ from office365.onedrive.driveitems.driveItem import DriveItem CONNECTOR_TYPE = "sharepoint" +LEGACY_DEFAULT_PATH = "Shared Documents" class SharepointAccessConfig(OnedriveAccessConfig): @@ -76,10 +77,14 @@ async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]: except ClientRequestException: logger.info("Site not found") - drive_items = await self.list_objects( - folder=site_drive_item, recursive=self.index_config.recursive - ) - for drive_item in drive_items: + path = self.index_config.path + # Deprecated sharepoint sdk needed a default path. Microsoft Graph SDK does not. + if path and path != LEGACY_DEFAULT_PATH: + site_drive_item = site_drive_item.get_by_path(path).get().execute_query() + + for drive_item in site_drive_item.get_files( + recursive=self.index_config.recursive + ).execute_query(): file_data = await self.drive_item_to_file_data(drive_item=drive_item) yield file_data