diff --git a/Pipfile.lock b/Pipfile.lock index d52d52b..bd7cc81 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -27,20 +27,20 @@ }, "boto3": { "hashes": [ - "sha256:c5cb2ada690c14e2dfa1e1c59ef7ef399c5e381f5514f1541d28310e35192300", - "sha256:eda49046c0f6a21ac159f9b2d609e5cc70d1dd019b7ac9618eec99285282b3db" + "sha256:4c9a62dcb5c3f905630fe99fb4b81131da84c5c92eedcc81a89cbd924c1c524f", + "sha256:9d6aad3fa8b90567006bf7b32efa26489fc306fbe63946eaf57b72356a45761d" ], "index": "pypi", "markers": "python_version >= '3.9'", - "version": "==1.42.7" + "version": "==1.42.13" }, "botocore": { "hashes": [ - "sha256:92128d56654342f026d5c20a92bf0e8b546be1eb38df2c0efc7433e8bbc39045", - "sha256:cc401b4836eae2a781efa1d1df88b2e92f9245885a6ae1bf9a6b26bc97b3efd2" + "sha256:7e4cf14bd5719b60600fb45d2bb3ae140feb3c182a863b93093aafce7f93cfee", + "sha256:b750b2de4a2478db9718a02395cb9da8698901ba02378d60037d6369ecb6bb88" ], "markers": "python_version >= '3.9'", - "version": "==1.42.7" + "version": "==1.42.13" }, "duckdb": { "hashes": [ @@ -419,11 +419,11 @@ }, "tzdata": { "hashes": [ - "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", - "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9" + "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", + "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7" ], "markers": "python_version >= '2'", - "version": "==2025.2" + "version": "==2025.3" }, "urllib3": { "hashes": [ @@ -486,39 +486,38 @@ }, "boto3": { "hashes": [ - "sha256:c5cb2ada690c14e2dfa1e1c59ef7ef399c5e381f5514f1541d28310e35192300", - "sha256:eda49046c0f6a21ac159f9b2d609e5cc70d1dd019b7ac9618eec99285282b3db" + "sha256:4c9a62dcb5c3f905630fe99fb4b81131da84c5c92eedcc81a89cbd924c1c524f", + "sha256:9d6aad3fa8b90567006bf7b32efa26489fc306fbe63946eaf57b72356a45761d" ], - "index": "pypi", "markers": "python_version >= '3.9'", - "version": "==1.42.7" + "version": "==1.42.13" }, "boto3-stubs": { "extras": [ "essential" ], "hashes": [ - "sha256:2d3f9e10686ca7ca054b788450fcc3d505b9f74954817afd994f71742df6d883", - "sha256:e4f153103a3e67a50aa98b9141382a16d37e943aef957319fe3a48e9b4a33a5a" + "sha256:2683835a105262e1d7404f638383c230d58178a31ffcd88ad70db941a8274427", + "sha256:de769f1e414ce4a69fa2f79c21054242501c27fbe964c69cde797fcc18e681c1" ], "markers": "python_version >= '3.9'", - "version": "==1.42.7" + "version": "==1.42.13" }, "botocore": { "hashes": [ - "sha256:92128d56654342f026d5c20a92bf0e8b546be1eb38df2c0efc7433e8bbc39045", - "sha256:cc401b4836eae2a781efa1d1df88b2e92f9245885a6ae1bf9a6b26bc97b3efd2" + "sha256:7e4cf14bd5719b60600fb45d2bb3ae140feb3c182a863b93093aafce7f93cfee", + "sha256:b750b2de4a2478db9718a02395cb9da8698901ba02378d60037d6369ecb6bb88" ], "markers": "python_version >= '3.9'", - "version": "==1.42.7" + "version": "==1.42.13" }, "botocore-stubs": { "hashes": [ - "sha256:951fc41e78e1a53b49bfe8511bc24e430eda4c689ca6033c643693e56485e69c", - "sha256:d5c9ac851e8d7ce30d25204add0e2448cb23cb5f8fbe6b5c768ea1e4471b4455" + "sha256:7a1d5749a5088fa3184add28efad1e6548039b40d8d00413c69021c0c31501ee", + "sha256:d565f01c31034e7fd86193c0f4937179a67a971098947c0d1e83d81973a9ec4c" ], "markers": "python_version >= '3.9'", - "version": "==1.42.7" + "version": "==1.42.13" }, "cachecontrol": { "extras": [ @@ -981,11 +980,11 @@ }, "filelock": { "hashes": [ - "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2", - "sha256:711e943b4ec6be42e1d4e6690b48dc175c822967466bb31c0c293f34334c13f4" + 
"sha256:15d9e9a67306188a44baa72f569d2bfd803076269365fdea0934385da4dc361a", + "sha256:b8360948b351b80f420878d8516519a2204b07aefcdcfd24912a5d33127f188c" ], "markers": "python_version >= '3.10'", - "version": "==3.20.0" + "version": "==3.20.1" }, "identify": { "hashes": [ @@ -1054,85 +1053,85 @@ }, "librt": { "hashes": [ - "sha256:020c6db391268bcc8ce75105cb572df8cb659a43fd347366aaa407c366e5117a", - "sha256:0fa9ac2e49a6bee56e47573a6786cb635e128a7b12a0dc7851090037c0d397a3", - "sha256:11ad45122bbed42cfc8b0597450660126ef28fd2d9ae1a219bc5af8406f95678", - "sha256:120dd21d46ff875e849f1aae19346223cf15656be489242fe884036b23d39e93", - "sha256:14569ac5dd38cfccf0a14597a88038fb16811a6fede25c67b79c6d50fc2c8fdc", - "sha256:1617bea5ab31266e152871208502ee943cb349c224846928a1173c864261375e", - "sha256:170cdb8436188347af17bf9cccf3249ba581c933ed56d926497119d4cf730cec", - "sha256:1975eda520957c6e0eb52d12968dd3609ffb7eef05d4223d097893d6daf1d8a7", - "sha256:1fe603877e1865b5fd047a5e40379509a4a60204aa7aa0f72b16f7a41c3f0712", - "sha256:24d70810f6e2ea853ff79338001533716b373cc0f63e2a0be5bc96129edb5fb5", - "sha256:256793988bff98040de23c57cf36e1f4c2f2dc3dcd17537cdac031d3b681db71", - "sha256:25711f364c64cab2c910a0247e90b51421e45dbc8910ceeb4eac97a9e132fc6f", - "sha256:2682162855a708e3270eba4b92026b93f8257c3e65278b456c77631faf0f4f7a", - "sha256:2cf9d73499486ce39eebbff5f42452518cc1f88d8b7ea4a711ab32962b176ee2", - "sha256:2e40520c37926166c24d0c2e0f3bc3a5f46646c34bdf7b4ea9747c297d6ee809", - "sha256:2e980cf1ed1a2420a6424e2ed884629cdead291686f1048810a817de07b5eb18", - "sha256:2f03484b54bf4ae80ab2e504a8d99d20d551bfe64a7ec91e218010b467d77093", - "sha256:35f1609e3484a649bb80431310ddbec81114cd86648f1d9482bc72a3b86ded2e", - "sha256:399938edbd3d78339f797d685142dd8a623dfaded023cf451033c85955e4838a", - "sha256:399bbd7bcc1633c3e356ae274a1deb8781c7bf84d9c7962cc1ae0c6e87837292", - "sha256:3ec50cf65235ff5c02c5b747748d9222e564ad48597122a361269dd3aa808798", - "sha256:3edbf257c40d21a42615e9e332a6b10a8bacaaf58250aed8552a14a70efd0d65", - "sha256:440c788f707c061d237c1e83edf6164ff19f5c0f823a3bf054e88804ebf971ec", - "sha256:44b3689b040df57f492e02cd4f0bacd1b42c5400e4b8048160c9d5e866de8abe", - "sha256:4887c29cadbdc50640179e3861c276325ff2986791e6044f73136e6e798ff806", - "sha256:5460d99ed30f043595bbdc888f542bad2caeb6226b01c33cda3ae444e8f82d42", - "sha256:550fdbfbf5bba6a2960b27376ca76d6aaa2bd4b1a06c4255edd8520c306fcfc0", - "sha256:56f2a47beda8409061bc1c865bef2d4bd9ff9255219402c0817e68ab5ad89aed", - "sha256:572a24fc5958c61431da456a0ef1eeea6b4989d81eeb18b8e5f1f3077592200b", - "sha256:59cb0470612d21fa1efddfa0dd710756b50d9c7fb6c1236bbf8ef8529331dc70", - "sha256:6038ccbd5968325a5d6fd393cf6e00b622a8de545f0994b89dd0f748dcf3e19e", - "sha256:6488e69d408b492e08bfb68f20c4a899a354b4386a446ecd490baff8d0862720", - "sha256:687403cced6a29590e6be6964463835315905221d797bc5c934a98750fe1a9af", - "sha256:6b407c23f16ccc36614c136251d6b32bf30de7a57f8e782378f1107be008ddb0", - "sha256:6b4e7bff1d76dd2b46443078519dc75df1b5e01562345f0bb740cea5266d8218", - "sha256:6bdd9adfca615903578d2060ee8a6eb1c24eaf54919ff0ddc820118e5718931b", - "sha256:6eb9295c730e26b849ed1f4022735f36863eb46b14b6e10604c1c39b8b5efaea", - "sha256:703456146dc2bf430f7832fd1341adac5c893ec3c1430194fdcefba00012555c", - "sha256:754a0d09997095ad764ccef050dd5bf26cbf457aab9effcba5890dad081d879e", - "sha256:7af7785f5edd1f418da09a8cdb9ec84b0213e23d597413e06525340bcce1ea4f", - "sha256:7b29e97273bd6999e2bfe9fe3531b1f4f64effd28327bced048a33e49b99674a", - "sha256:7b4f57f7a0c65821c5441d98c47ff7c01d359b1e12328219709bdd97fdd37f90", 
- "sha256:8837d5a52a2d7aa9f4c3220a8484013aed1d8ad75240d9a75ede63709ef89055", - "sha256:8ccadf260bb46a61b9c7e89e2218f6efea9f3eeaaab4e3d1f58571890e54858e", - "sha256:8d8cf653e798ee4c4e654062b633db36984a1572f68c3aa25e364a0ddfbbb910", - "sha256:93b2a1f325fefa1482516ced160c8c7b4b8d53226763fa6c93d151fa25164207", - "sha256:9f0e0927efe87cd42ad600628e595a1a0aa1c64f6d0b55f7e6059079a428641a", - "sha256:a59a69deeb458c858b8fea6acf9e2acd5d755d76cd81a655256bc65c20dfff5b", - "sha256:a9f9b661f82693eb56beb0605156c7fca57f535704ab91837405913417d6990b", - "sha256:abfc57cab3c53c4546aee31859ef06753bfc136c9d208129bad23e2eca39155a", - "sha256:aca73d70c3f553552ba9133d4a09e767dcfeee352d8d8d3eb3f77e38a3beb3ed", - "sha256:adeaa886d607fb02563c1f625cf2ee58778a2567c0c109378da8f17ec3076ad7", - "sha256:b278a9248a4e3260fee3db7613772ca9ab6763a129d6d6f29555e2f9b168216d", - "sha256:b7c1239b64b70be7759554ad1a86288220bbb04d68518b527783c4ad3fb4f80b", - "sha256:bf8c7735fbfc0754111f00edda35cf9e98a8d478de6c47b04eaa9cef4300eaa7", - "sha256:c634a0a6db395fdaba0361aa78395597ee72c3aad651b9a307a3a7eaf5efd67e", - "sha256:cad9971881e4fec00d96af7eaf4b63aa7a595696fc221808b0d3ce7ca9743258", - "sha256:cbdb3f337c88b43c3b49ca377731912c101178be91cb5071aac48faa898e6f8e", - "sha256:cd8551aa21df6c60baa2624fd086ae7486bdde00c44097b32e1d1b1966e365e0", - "sha256:d09f677693328503c9e492e33e9601464297c01f9ebd966ea8fc5308f3069bfd", - "sha256:d376a35c6561e81d2590506804b428fc1075fcc6298fc5bb49b771534c0ba010", - "sha256:d39079379a9a28e74f4d57dc6357fa310a1977b51ff12239d7271ec7e71d67f5", - "sha256:d86f94743a11873317094326456b23f8a5788bad9161fd2f0e52088c33564620", - "sha256:d91e60ac44bbe3a77a67af4a4c13114cbe9f6d540337ce22f2c9eaf7454ca71f", - "sha256:d9883b2d819ce83f87ba82a746c81d14ada78784db431e57cc9719179847376e", - "sha256:e094e445c37c57e9ec612847812c301840239d34ccc5d153a982fa9814478c60", - "sha256:e19acfde38cb532a560b98f473adc741c941b7a9bc90f7294bc273d08becb58b", - "sha256:e32d43610dff472eab939f4d7fbdd240d1667794192690433672ae22d7af8445", - "sha256:ed028fc3d41adda916320712838aec289956c89b4f0a361ceadf83a53b4c047a", - "sha256:ef59c938f72bdbc6ab52dc50f81d0637fde0f194b02d636987cea2ab30f8f55a", - "sha256:f3d4801db8354436fd3936531e7f0e4feb411f62433a6b6cb32bb416e20b529f", - "sha256:f57aca20e637750a2c18d979f7096e2c2033cc40cf7ed201494318de1182f135", - "sha256:f9da128d0edf990cf0d2ca011b02cd6f639e79286774bd5b0351245cbb5a6e51", - "sha256:fbd7351d43b80d9c64c3cfcb50008f786cc82cba0450e8599fdd64f264320bd3", - "sha256:fcb72249ac4ea81a7baefcbff74df7029c3cb1cf01a711113fa052d563639c9c", - "sha256:ff21c554304e8226bf80c3a7754be27c6c3549a9fec563a03c06ee8f494da8fc" + "sha256:022cc673e69283a42621dd453e2407cf1647e77f8bd857d7ad7499901e62376f", + "sha256:02a69369862099e37d00765583052a99d6a68af7e19b887e1b78fee0146b755a", + "sha256:037f5cb6fe5abe23f1dc058054d50e9699fcc90d0677eee4e4f74a8677636a1a", + "sha256:064a286e6ab0b4c900e228ab4fa9cb3811b4b83d3e0cc5cd816b2d0f548cb61c", + "sha256:078ae52ffb3f036396cc4aed558e5b61faedd504a3c1f62b8ae34bf95ae39d94", + "sha256:07c4d7c9305e75a0edd3427b79c7bd1d019cd7eddaa7c89dbb10e0c7946bffbb", + "sha256:0e8f864b521f6cfedb314d171630f827efee08f5c3462bcbc2244ab8e1768cd6", + "sha256:0f8cac84196d0ffcadf8469d9ded4d4e3a8b1c666095c2a291e22bf58e1e8a9f", + "sha256:0fd766bb9ace3498f6b93d32f30c0e7c8ce6b727fecbc84d28160e217bb66254", + "sha256:114722f35093da080a333b3834fff04ef43147577ed99dd4db574b03a5f7d170", + "sha256:1437c3f72a30c7047f16fd3e972ea58b90172c3c6ca309645c1c68984f05526a", + "sha256:188b4b1a770f7f95ea035d5bbb9d7367248fc9d12321deef78a269ebf46a5729", + 
"sha256:1b668b1c840183e4e38ed5a99f62fac44c3a3eef16870f7f17cfdfb8b47550ed", + "sha256:1c4c89fb01157dd0a3bfe9e75cd6253b0a1678922befcd664eca0772a4c6c979", + "sha256:1ef704e01cb6ad39ad7af668d51677557ca7e5d377663286f0ee1b6b27c28e5f", + "sha256:21ea710e96c1e050635700695095962a22ea420d4b3755a25e4909f2172b4ff2", + "sha256:25cc40d8eb63f0a7ea4c8f49f524989b9df901969cb860a2bc0e4bad4b8cb8a8", + "sha256:2857c875f1edd1feef3c371fbf830a61b632fb4d1e57160bb1e6a3206e6abe67", + "sha256:28f990e6821204f516d09dc39966ef8b84556ffd648d5926c9a3f681e8de8906", + "sha256:2b3ca211ae8ea540569e9c513da052699b7b06928dcda61247cb4f318122bdb5", + "sha256:2e734c2c54423c6dcc77f58a8585ba83b9f72e422f9edf09cab1096d4a4bdc82", + "sha256:3485b9bb7dfa66167d5500ffdafdc35415b45f0da06c75eb7df131f3357b174a", + "sha256:3749ef74c170809e6dee68addec9d2458700a8de703de081c888e92a8b015cf9", + "sha256:3871af56c59864d5fd21d1ac001eb2fb3b140d52ba0454720f2e4a19812404ba", + "sha256:39003fc73f925e684f8521b2dbf34f61a5deb8a20a15dcf53e0d823190ce8848", + "sha256:3ca1caedf8331d8ad6027f93b52d68ed8f8009f5c420c246a46fe9d3be06be0f", + "sha256:419eea245e7ec0fe664eb7e85e7ff97dcdb2513ca4f6b45a8ec4a3346904f95a", + "sha256:42da201c47c77b6cc91fc17e0e2b330154428d35d6024f3278aa2683e7e2daf2", + "sha256:43a2515a33f2bc17b15f7fb49ff6426e49cb1d5b2539bc7f8126b9c5c7f37164", + "sha256:4450c354b89dbb266730893862dbff06006c9ed5b06b6016d529b2bf644fc681", + "sha256:4df7c9def4fc619a9c2ab402d73a0c5b53899abe090e0100323b13ccb5a3dd82", + "sha256:4f1ee004942eaaed6e06c087d93ebc1c67e9a293e5f6b9b5da558df6bf23dc5d", + "sha256:52e34c6af84e12921748c8354aa6acf1912ca98ba60cdaa6920e34793f1a0788", + "sha256:543c42fa242faae0466fe72d297976f3c710a357a219b1efde3a0539a68a6997", + "sha256:5a72b905420c4bb2c10c87b5c09fe6faf4a76d64730e3802feef255e43dfbf5a", + "sha256:618b7459bb392bdf373f2327e477597fff8f9e6a1878fffc1b711c013d1b0da4", + "sha256:6bb15ee29d95875ad697d449fe6071b67f730f15a6961913a2b0205015ca0843", + "sha256:6fc4aa67fedd827a601f97f0e61cc72711d0a9165f2c518e9a7c38fc1568b9ad", + "sha256:70969229cb23d9c1a80e14225838d56e464dc71fa34c8342c954fc50e7516dee", + "sha256:71a56f4671f7ff723451f26a6131754d7c1809e04e22ebfbac1db8c9e6767a20", + "sha256:721a7b125a817d60bf4924e1eec2a7867bfcf64cfc333045de1df7a0629e4481", + "sha256:76b2ba71265c0102d11458879b4d53ccd0b32b0164d14deb8d2b598a018e502f", + "sha256:772e18696cf5a64afee908662fbcb1f907460ddc851336ee3a848ef7684c8e1e", + "sha256:7766b57aeebaf3f1dac14fdd4a75c9a61f2ed56d8ebeefe4189db1cb9d2a3783", + "sha256:776dbb9bfa0fc5ce64234b446995d8d9f04badf64f544ca036bd6cff6f0732ce", + "sha256:77772a4b8b5f77d47d883846928c36d730b6e612a6388c74cba33ad9eb149c11", + "sha256:7dd3b5c37e0fb6666c27cf4e2c88ae43da904f2155c4cfc1e5a2fdce3b9fcf92", + "sha256:7e4b5ffa1614ad4f32237d739699be444be28de95071bfa4e66a8da9fa777798", + "sha256:8a461f6456981d8c8e971ff5a55f2e34f4e60871e665d2f5fde23ee74dea4eeb", + "sha256:95cb80854a355b284c55f79674f6187cc9574df4dc362524e0cce98c89ee8331", + "sha256:a34ae11315d4e26326aaf04e21ccd8d9b7de983635fba38d73e203a9c8e3fe3d", + "sha256:a4f7339d9e445280f23d63dea842c0c77379c4a47471c538fc8feedab9d8d063", + "sha256:a5deebb53d7a4d7e2e758a96befcd8edaaca0633ae71857995a0f16033289e44", + "sha256:a9c5de1928c486201b23ed0cc4ac92e6e07be5cd7f3abc57c88a9cf4f0f32108", + "sha256:adefe0d48ad35b90b6f361f6ff5a1bd95af80c17d18619c093c60a20e7a5b60c", + "sha256:b35c63f557653c05b5b1b6559a074dbabe0afee28ee2a05b6c9ba21ad0d16a74", + "sha256:b370a77be0a16e1ad0270822c12c21462dc40496e891d3b0caf1617c8cc57e20", + "sha256:b4c25312c7f4e6ab35ab16211bdf819e6e4eddcba3b2ea632fb51c9a2a97e105", + 
"sha256:b719c8730c02a606dc0e8413287e8e94ac2d32a51153b300baf1f62347858fba", + "sha256:bc4aebecc79781a1b77d7d4e7d9fe080385a439e198d993b557b60f9117addaf", + "sha256:c2a6f1236151e6fe1da289351b5b5bce49651c91554ecc7b70a947bced6fe212", + "sha256:c66c2b245926ec15188aead25d395091cb5c9df008d3b3207268cd65557d6286", + "sha256:c96cb76f055b33308f6858b9b594618f1b46e147a4d03a4d7f0c449e304b9b95", + "sha256:c9cab4b3de1f55e6c30a84c8cee20e4d3b2476f4d547256694a1b0163da4fe32", + "sha256:ce1b44091355b68cffd16e2abac07c1cafa953fa935852d3a4dd8975044ca3bf", + "sha256:ce58420e25097b2fc201aef9b9f6d65df1eb8438e51154e1a7feb8847e4a55ab", + "sha256:d05acd46b9a52087bfc50c59dfdf96a2c480a601e8898a44821c7fd676598f74", + "sha256:d31acb5886c16ae1711741f22504195af46edec8315fe69b77e477682a87a83e", + "sha256:d44a1b1ba44cbd2fc3cb77992bef6d6fdb1028849824e1dd5e4d746e1f7f7f0b", + "sha256:d854c6dc0f689bad7ed452d2a3ecff58029d80612d336a45b62c35e917f42d23", + "sha256:dc300cb5a5a01947b1ee8099233156fdccd5001739e5f596ecfbc0dab07b5a3b", + "sha256:e710c983d29d9cc4da29113b323647db286eaf384746344f4a233708cca1a82c", + "sha256:ec72342cc4d62f38b25a94e28b9efefce41839aecdecf5e9627473ed04b7be16", + "sha256:ee8d3323d921e0f6919918a97f9b5445a7dfe647270b2629ec1008aa676c0bc0", + "sha256:f79bc3595b6ed159a1bf0cdc70ed6ebec393a874565cab7088a219cca14da727", + "sha256:f7fa8beef580091c02b4fd26542de046b2abfe0aaefa02e8bcf68acb7618f2b3" ], "markers": "python_version >= '3.9'", - "version": "==0.7.3" + "version": "==0.7.4" }, "license-expression": { "hashes": [ @@ -1340,48 +1339,48 @@ }, "mypy": { "hashes": [ - "sha256:0c01c99d626380752e527d5ce8e69ffbba2046eb8a060db0329690849cf9b6f9", - "sha256:0dde5cb375cb94deff0d4b548b993bec52859d1651e073d63a1386d392a95495", - "sha256:0e3c3d1e1d62e678c339e7ade72746a9e0325de42cd2cccc51616c7b2ed1a018", - "sha256:0ea4fd21bb48f0da49e6d3b37ef6bd7e8228b9fe41bbf4d80d9364d11adbd43c", - "sha256:0fb3115cb8fa7c5f887c8a8d81ccdcb94cff334684980d847e5a62e926910e1d", - "sha256:11f7254c15ab3f8ed68f8e8f5cbe88757848df793e31c36aaa4d4f9783fd08ab", - "sha256:120cffe120cca5c23c03c77f84abc0c14c5d2e03736f6c312480020082f1994b", - "sha256:16f76ff3f3fd8137aadf593cb4607d82634fca675e8211ad75c43d86033ee6c6", - "sha256:1cf9c59398db1c68a134b0b5354a09a1e124523f00bacd68e553b8bd16ff3299", - "sha256:318ba74f75899b0e78b847d8c50821e4c9637c79d9a59680fc1259f29338cb3e", - "sha256:3210d87b30e6af9c8faed61be2642fcbe60ef77cec64fa1ef810a630a4cf671c", - "sha256:34ec1ac66d31644f194b7c163d7f8b8434f1b49719d403a5d26c87fff7e913f7", - "sha256:37af5166f9475872034b56c5efdcf65ee25394e9e1d172907b84577120714364", - "sha256:3ad925b14a0bb99821ff6f734553294aa6a3440a8cb082fe1f5b84dfb662afb1", - "sha256:510c014b722308c9bd377993bcbf9a07d7e0692e5fa8fc70e639c1eb19fc6bee", - "sha256:6016c52ab209919b46169651b362068f632efcd5eb8ef9d1735f6f86da7853b2", - "sha256:6148ede033982a8c5ca1143de34c71836a09f105068aaa8b7d5edab2b053e6c8", - "sha256:63ea6a00e4bd6822adbfc75b02ab3653a17c02c4347f5bb0cf1d5b9df3a05835", - "sha256:7686ed65dbabd24d20066f3115018d2dce030d8fa9db01aa9f0a59b6813e9f9e", - "sha256:7a500ab5c444268a70565e374fc803972bfd1f09545b13418a5174e29883dab7", - "sha256:8f44f2ae3c58421ee05fe609160343c25f70e3967f6e32792b5a78006a9d850f", - "sha256:a18d8abdda14035c5718acb748faec09571432811af129bf0d9e7b2d6699bf18", - "sha256:a31e4c28e8ddb042c84c5e977e28a21195d086aaffaf08b016b78e19c9ef8106", - "sha256:a9ac09e52bb0f7fb912f5d2a783345c72441a08ef56ce3e17c1752af36340a39", - "sha256:b9d491295825182fba01b6ffe2c6fe4e5a49dbf4e2bb4d1217b6ced3b4797bc6", - 
"sha256:c14a98bc63fd867530e8ec82f217dae29d0550c86e70debc9667fff1ec83284e", - "sha256:c3385246593ac2b97f155a0e9639be906e73534630f663747c71908dfbf26134", - "sha256:cabbee74f29aa9cd3b444ec2f1e4fa5a9d0d746ce7567a6a609e224429781f53", - "sha256:cb64b0ba5980466a0f3f9990d1c582bcab8db12e29815ecb57f1408d99b4bff7", - "sha256:cf7d84f497f78b682edd407f14a7b6e1a2212b433eedb054e2081380b7395aa3", - "sha256:e2c1101ab41d01303103ab6ef82cbbfedb81c1a060c868fa7cc013d573d37ab5", - "sha256:f188dcf16483b3e59f9278c4ed939ec0254aa8a60e8fc100648d9ab5ee95a431", - "sha256:f2e36bed3c6d9b5f35d28b63ca4b727cb0228e480826ffc8953d1892ddc8999d", - "sha256:f3e19e3b897562276bb331074d64c076dbdd3e79213f36eed4e592272dabd760", - "sha256:f6b874ca77f733222641e5c46e4711648c4037ea13646fd0cdc814c2eaec2528", - "sha256:f75e60aca3723a23511948539b0d7ed514dda194bc3755eae0bfc7a6b4887aa7", - "sha256:fc51a5b864f73a3a182584b1ac75c404396a17eced54341629d8bdcb644a5bba", - "sha256:fd4a985b2e32f23bead72e2fb4bbe5d6aceee176be471243bd831d5b2644672d" + "sha256:016f2246209095e8eda7538944daa1d60e1e8134d98983b9fc1e92c1fc0cb8dd", + "sha256:022ea7279374af1a5d78dfcab853fe6a536eebfda4b59deab53cd21f6cd9f00b", + "sha256:06e6170bd5836770e8104c8fdd58e5e725cfeb309f0a6c681a811f557e97eac1", + "sha256:19d88bb05303fe63f71dd2c6270daca27cb9401c4ca8255fe50d1d920e0eb9ba", + "sha256:21761006a7f497cb0d4de3d8ef4ca70532256688b0523eee02baf9eec895e27b", + "sha256:28902ee51f12e0f19e1e16fbe2f8f06b6637f482c459dd393efddd0ec7f82045", + "sha256:2899753e2f61e571b3971747e302d5f420c3fd09650e1951e99f823bc3089dac", + "sha256:2abb24cf3f17864770d18d673c85235ba52456b36a06b6afc1e07c1fdcd3d0e6", + "sha256:34c81968774648ab5ac09c29a375fdede03ba253f8f8287847bd480782f73a6a", + "sha256:409088884802d511ee52ca067707b90c883426bd95514e8cfda8281dc2effe24", + "sha256:481daf36a4c443332e2ae9c137dfee878fcea781a2e3f895d54bd3002a900957", + "sha256:4b84a7a18f41e167f7995200a1d07a4a6810e89d29859df936f1c3923d263042", + "sha256:4f28f99c824ecebcdaa2e55d82953e38ff60ee5ec938476796636b86afa3956e", + "sha256:5f05aa3d375b385734388e844bc01733bd33c644ab48e9684faa54e5389775ec", + "sha256:7bcfc336a03a1aaa26dfce9fff3e287a3ba99872a157561cbfcebe67c13308e3", + "sha256:804bd67b8054a85447c8954215a906d6eff9cabeabe493fb6334b24f4bfff718", + "sha256:8bb5c6f6d043655e055be9b542aa5f3bdd30e4f3589163e85f93f3640060509f", + "sha256:a009ffa5a621762d0c926a078c2d639104becab69e79538a494bcccb62cc0331", + "sha256:a8174a03289288c1f6c46d55cef02379b478bfbc8e358e02047487cad44c6ca1", + "sha256:ab43590f9cd5108f41aacf9fca31841142c786827a74ab7cc8a2eacb634e09a1", + "sha256:b10e7c2cd7870ba4ad9b2d8a6102eb5ffc1f16ca35e3de6bfa390c1113029d13", + "sha256:b13cfdd6c87fc3efb69ea4ec18ef79c74c3f98b4e5498ca9b85ab3b2c2329a67", + "sha256:b64d987153888790bcdb03a6473d321820597ab8dd9243b27a92153c4fa50fd2", + "sha256:b7951a701c07ea584c4fe327834b92a30825514c868b1f69c30445093fdd9d5a", + "sha256:bdb12f69bcc02700c2b47e070238f42cb87f18c0bc1fc4cdb4fb2bc5fd7a3b8b", + "sha256:c35d298c2c4bba75feb2195655dfea8124d855dfd7343bf8b8c055421eaf0cf8", + "sha256:c608937067d2fc5a4dd1a5ce92fd9e1398691b8c5d012d66e1ddd430e9244376", + "sha256:c9a6538e0415310aad77cb94004ca6482330fece18036b5f360b62c45814c4ef", + "sha256:d8dfc6ab58ca7dda47d9237349157500468e404b17213d44fc1cb77bce532288", + "sha256:da4869fc5e7f62a88f3fe0b5c919d1d9f7ea3cef92d3689de2823fd27e40aa75", + "sha256:de759aafbae8763283b2ee5869c7255391fbc4de3ff171f8f030b5ec48381b74", + "sha256:e3157c7594ff2ef1634ee058aafc56a82db665c9438fd41b390f3bde1ab12250", + "sha256:e3f276d8493c3c97930e354b2595a44a21348b320d859fb4a2b9f66da9ed27ab", + 
"sha256:ee4c11e460685c3e0c64a4c5de82ae143622410950d6be863303a1c4ba0e36d6", + "sha256:f1235f5ea01b7db5468d53ece6aaddf1ad0b88d9e7462b86ef96fe04995d7247", + "sha256:f7cee03c9a2e2ee26ec07479f38ea9c884e301d42c6d43a19d20fb014e3ba925", + "sha256:f859fb09d9583a985be9a493d5cfc5515b56b08f7447759a0c5deaf68d80506e", + "sha256:ffcebe56eb09ff0c0885e750036a095e23793ba6c2e894e7e63f6d89ad51f22e" ], "index": "pypi", "markers": "python_version >= '3.9'", - "version": "==1.19.0" + "version": "==1.19.1" }, "mypy-boto3-cloudformation": { "hashes": [ @@ -1401,19 +1400,19 @@ }, "mypy-boto3-ec2": { "hashes": [ - "sha256:1b54eaa6403c10677496f7dccdd0c2911533b1f26e4a5732af56fbd31141796f", - "sha256:bc89dcbd7057bd58fd4dc9956ae4d581c35c908c4920318d954ec41fe507ee37" + "sha256:a7e392906fac6a3aeb78edd990f9781de435f437be376bebe9e137a44143f73f", + "sha256:d86406842e8fdeb3dda2097372bc51ecddebf3903e206b9ffb7472ee2660d2b8" ], "markers": "python_version >= '3.9'", - "version": "==1.42.5" + "version": "==1.42.13" }, "mypy-boto3-lambda": { "hashes": [ - "sha256:12e810b4c7d37be3c4d83e6c6bf638da4f56ad191495ddbeb589929f2270f9f0", - "sha256:c1ecd68b1c2fa89ff18d32cbf3b12cdbb078b8a5bbbc95da80f9a61bfcd1229d" + "sha256:55deadbfaf0e5f118237831a84d35f48dc7164ce2bf7efdcb54f54aef4025602", + "sha256:fbb6646138520c675a4c4adff334e830b010d5c077dee8d5187346809ebb6f72" ], "markers": "python_version >= '3.9'", - "version": "==1.42.3" + "version": "==1.42.8" }, "mypy-boto3-rds": { "hashes": [ @@ -1425,11 +1424,11 @@ }, "mypy-boto3-s3": { "hashes": [ - "sha256:2507bdfa17829f1f422b8bf334db836689b0529fbe6635af4e0f3aaa72f92806", - "sha256:97c5171928a2ae8c7b60a60700f395407cb5eca60704c7ab6a1ff0861f4db997" + "sha256:9a4575124b500c29c023919f17b022e66109a56ba2318ef8aeab3d0dd2cd174e", + "sha256:e5f6fb51f215b30255ee076712032c6810b274a20062d5fa2ecd7816ac1a1274" ], "markers": "python_version >= '3.9'", - "version": "==1.42.3" + "version": "==1.42.10" }, "mypy-boto3-sqs": { "hashes": [ @@ -1635,12 +1634,12 @@ }, "pre-commit": { "hashes": [ - "sha256:25e2ce09595174d9c97860a95609f9f852c0614ba602de3561e267547f2335e1", - "sha256:dc5a065e932b19fc1d4c653c6939068fe54325af8e741e74e88db4d28a4dd66b" + "sha256:3b3afd891e97337708c1674210f8eba659b52a38ea5f822ff142d10786221f77", + "sha256:eb545fcff725875197837263e977ea257a402056661f09dae08e4b149b030a61" ], "index": "pypi", "markers": "python_version >= '3.10'", - "version": "==4.5.0" + "version": "==4.5.1" }, "prompt-toolkit": { "hashes": [ @@ -1725,18 +1724,17 @@ "sha256:f7fe3dbe871294ba70d789be16b6e7e52b418311e166e0e3cba9522f0f437fb1", "sha256:f963ba8c3b0199f9d6b794c90ec77545e05eadc83973897a4523c9e8d84e9340" ], - "index": "pypi", "markers": "python_version >= '3.10'", "version": "==22.0.0" }, "pyarrow-stubs": { "hashes": [ - "sha256:a53793149bcbf41670acb8cd843645e3bf0469e4aa035824adda61c48fa900c6", - "sha256:eab02b02d4d74d86619b7f7b9fe6e7ddfe425cedf8c31aa4c7fd33cdb3b189be" + "sha256:0634e70388cd23e7c78e2abbb1989822edc34df2d2ff4fd50a2316dd0cdafd9f", + "sha256:92c1fda4998f0c13e608d8abc7e4b8537e3ef108f6bf42c58e5af97e7d143e75" ], "index": "pypi", "markers": "python_version >= '3.9' and python_version < '4'", - "version": "==20.0.0.20251209" + "version": "==20.0.0.20251215" }, "pycparser": { "hashes": [ @@ -1901,29 +1899,29 @@ }, "ruff": { "hashes": [ - "sha256:15f04cb45c051159baebb0f0037f404f1dc2f15a927418f29730f411a79bc4e7", - "sha256:1af35c2d62633d4da0521178e8a2641c636d2a7153da0bac1b30cfd4ccd91344", - "sha256:1d62cb310c4fbcb9ee4ac023fe17f984ae1e12b8a4a02e3d21489f9a2a5f730c", - 
"sha256:21d48fa744c9d1cb8d71eb0a740c4dd02751a5de9db9a730a8ef75ca34cf138e", - "sha256:25add4575ffecc53d60eed3f24b1e934493631b48ebbc6ebaf9d8517924aca4b", - "sha256:2c87e09b3cd9d126fc67a9ecd3b5b1d3ded2b9c7fce3f16e315346b9d05cfb52", - "sha256:2e2fcbefe91f9fad0916850edf0854530c15bd1926b6b779de47e9ab619ea38f", - "sha256:4c943d847b7f02f7db4201a0600ea7d244d8a404fbb639b439e987edcf2baf9a", - "sha256:774ed0dd87d6ce925e3b8496feb3a00ac564bea52b9feb551ecd17e0a23d1eed", - "sha256:7aaf2974f378e6b01d1e257c6948207aec6a9b5ba53fab23d0182efb887a0e4a", - "sha256:8cdb162a7159f4ca36ce980a18c43d8f036966e7f73f866ac8f493b75e0c27e9", - "sha256:965a582c93c63fe715fd3e3f8aa37c4b776777203d8e1d8aa3cc0c14424a4b99", - "sha256:9eeb0b24242b5bbff3011409a739929f497f3fb5fe3b5698aba5e77e8c833097", - "sha256:a9d70721066a296f45786ec31916dc287b44040f553da21564de0ab4d45a869b", - "sha256:cb6e8bf7b4f627548daa1b69283dac5a296bfe9ce856703b03130732e20ddfe2", - "sha256:e5758ca513c43ad8a4ef13f0f081f80f08008f410790f3611a21a92421ab045b", - "sha256:ec071e9c82eca417f6111fd39f7043acb53cd3fde9b1f95bbed745962e345afb", - "sha256:eed28f6fafcc9591994c42254f5a5c5ca40e69a30721d2ab18bb0bb3baac3ab6", - "sha256:f74f7ba163b6e85a8d81a590363bf71618847e5078d90827749bfda1d88c9cdf" + "sha256:104c49fc7ab73f3f3a758039adea978869a918f31b73280db175b43a2d9b51d6", + "sha256:1484983559f026788e3a5c07c81ef7d1e97c1c78ed03041a18f75df104c45405", + "sha256:16a01dfb7b9e4eee556fbfd5392806b1b8550c9b4a9f6acd3dbe6812b193c70a", + "sha256:213db2b2e44be8625002dbea33bb9c60c66ea2c07c084a00d55732689d697a7f", + "sha256:466297bd73638c6bdf06485683e812db1c00c7ac96d4ddd0294a338c62fdc154", + "sha256:4bb98fcbbc61725968893682fd4df8966a34611239c9fd07a1f6a07e7103d08e", + "sha256:59aabd2e2c4fd614d2862e7939c34a532c04f1084476d6833dddef4afab87e9f", + "sha256:5bcf45b681e9f1ee6445d317ce1fa9d6cba9a6049542d1c3d5b5958986be8830", + "sha256:674f9be9372907f7257c51f1d4fc902cb7cf014b9980152b802794317941f08f", + "sha256:6987ebe0501ae4f4308d7d24e2d0fe3d7a98430f5adfd0f1fead050a740a3a77", + "sha256:7165d31a925b7a294465fa81be8c12a0e9b60fb02bf177e79067c867e71f8b1f", + "sha256:7a3ce585f2ade3e1f29ec1b92df13e3da262178df8c8bdf876f48fa0e8316c49", + "sha256:9a2e830f075d1a42cd28420d7809ace390832a490ed0966fe373ba288e77aaf4", + "sha256:b914c40ab64865a17a9a5b67911d14df72346a634527240039eb3bd650e5979d", + "sha256:c561695675b972effb0c0a45db233f2c816ff3da8dcfbe7dfc7eed625f218935", + "sha256:c70427132db492d25f982fffc8d6c7535cc2fd2c83fc8888f05caaa248521e60", + "sha256:d85713d522348837ef9df8efca33ccb8bd6fcfc86a2cde3ccb4bc9d28a18003d", + "sha256:e51d046cf6dda98a4633b8a8a771451107413b0f07183b2bef03f075599e44e6", + "sha256:f24b47993a9d8cb858429e97bdf8544c78029f09b520af615c1d261bf827001d" ], "index": "pypi", "markers": "python_version >= '3.7'", - "version": "==0.14.8" + "version": "==0.14.10" }, "s3transfer": { "hashes": [ @@ -2030,11 +2028,11 @@ }, "types-awscrt": { "hashes": [ - "sha256:3f5d1e6c99b0b551af6365f9c04d8ce2effbcfe18bb719a34501efea279ae7bb", - "sha256:41e01e14d646877bd310e7e3c49ff193f8361480b9568e97b1639775009bbefa" + "sha256:362fd8f5eaebcfcd922cb9fd8274fb375df550319f78031ee3779eac0b9ecc79", + "sha256:8204126e01a00eaa4a746e7a0076538ca0e4e3f52408adec0ab9b471bb0bb64b" ], "markers": "python_version >= '3.8'", - "version": "==0.29.2" + "version": "==0.30.0" }, "types-pytz": { "hashes": [ diff --git a/tests/conftest.py b/tests/conftest.py index ae8dcd3..d2ad0f4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,12 +12,13 @@ generate_sample_embeddings_for_run, generate_sample_records, ) -from timdex_dataset_api 
import TIMDEXDataset, TIMDEXDatasetMetadata +from timdex_dataset_api import TIMDEXDataset from timdex_dataset_api.dataset import TIMDEXDatasetConfig from timdex_dataset_api.embeddings import ( DatasetEmbedding, TIMDEXEmbeddings, ) +from timdex_dataset_api.metadata import TIMDEXDatasetMetadata from timdex_dataset_api.record import DatasetRecord @@ -230,10 +231,8 @@ def timdex_dataset_same_day_runs(tmp_path) -> TIMDEXDataset: @pytest.fixture(scope="module") def timdex_metadata(timdex_dataset_with_runs) -> TIMDEXDatasetMetadata: """TIMDEXDatasetMetadata with static database file created.""" - metadata = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location) - metadata.rebuild_dataset_metadata() - metadata.refresh() - return metadata + timdex_dataset_with_runs.metadata.rebuild_dataset_metadata() + return timdex_dataset_with_runs.metadata @pytest.fixture(scope="module") @@ -247,9 +246,9 @@ def timdex_dataset_with_runs_with_metadata( @pytest.fixture -def timdex_metadata_empty(timdex_dataset_with_runs) -> TIMDEXDatasetMetadata: +def timdex_metadata_empty(timdex_dataset_empty) -> TIMDEXDatasetMetadata: """TIMDEXDatasetMetadata without static database file.""" - return TIMDEXDatasetMetadata(timdex_dataset_with_runs.location) + return timdex_dataset_empty.metadata @pytest.fixture @@ -271,7 +270,8 @@ def timdex_metadata_with_deltas( ) td.write(records) - return TIMDEXDatasetMetadata(timdex_dataset_with_runs.location) + # return fresh TIMDEXDataset's metadata + return TIMDEXDataset(timdex_dataset_with_runs.location).metadata @pytest.fixture @@ -286,12 +286,11 @@ def timdex_metadata_merged_deltas( # clone dataset with runs using new dataset location td = TIMDEXDataset(dataset_location, config=timdex_dataset_with_runs.config) - # clone metadata and merge append deltas - metadata = TIMDEXDatasetMetadata(td.location) - metadata.merge_append_deltas() - metadata.refresh() + # merge append deltas via the TD's metadata + td.metadata.merge_append_deltas() + td.refresh() - return metadata + return td.metadata # ================================================================================ diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py index 09a22c4..fef7fc5 100644 --- a/tests/test_embeddings.py +++ b/tests/test_embeddings.py @@ -152,12 +152,13 @@ def test_embeddings_read_batches_yields_pyarrow_record_batches( timdex_dataset_empty.metadata.rebuild_dataset_metadata() timdex_dataset_empty.refresh() - # write embeddings - timdex_embeddings = TIMDEXEmbeddings(timdex_dataset_empty) - timdex_embeddings.write(sample_embeddings_generator(100, run_id="test-run")) - timdex_embeddings = TIMDEXEmbeddings(timdex_dataset_empty) + # write embeddings and refresh to pick up new views + timdex_dataset_empty.embeddings.write( + sample_embeddings_generator(100, run_id="test-run") + ) + timdex_dataset_empty.refresh() - batches = timdex_embeddings.read_batches_iter() + batches = timdex_dataset_empty.embeddings.read_batches_iter() batch = next(batches) assert isinstance(batch, pa.RecordBatch) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 3674f63..af94193 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -6,7 +6,7 @@ from duckdb import DuckDBPyConnection -from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata +from timdex_dataset_api import TIMDEXDataset ORDERED_METADATA_COLUMN_NAMES = [ "timdex_record_id", @@ -21,29 +21,33 @@ ] -def test_tdm_init_no_metadata_file_warning_success(caplog, timdex_dataset_with_runs): - 
TIMDEXDatasetMetadata(timdex_dataset_with_runs.location) - +def test_tdm_init_no_metadata_file_warning_success(caplog, tmp_path): + # creating a new TIMDEXDataset will log warning if no metadata file + caplog.set_level("WARNING") + TIMDEXDataset(str(tmp_path / "new_empty_dataset")) assert "Static metadata database not found" in caplog.text def test_tdm_local_dataset_structure_properties(tmp_path): local_root = str(Path(tmp_path) / "path/to/nothing") - tdm_local = TIMDEXDatasetMetadata(local_root) - assert tdm_local.location == local_root - assert tdm_local.location_scheme == "file" + td_local = TIMDEXDataset(local_root) + assert td_local.metadata.location == local_root + assert td_local.metadata.location_scheme == "file" -def test_tdm_s3_dataset_structure_properties(s3_bucket_mocked): - s3_root = "s3://timdex/dataset" - tdm_s3 = TIMDEXDatasetMetadata(s3_root) - assert tdm_s3.location == s3_root - assert tdm_s3.location_scheme == "s3" +def test_tdm_s3_dataset_structure_properties(timdex_dataset_empty): + # test that location_scheme property works correctly for local paths + # S3 tests require full mocking and are covered in other tests + assert timdex_dataset_empty.metadata.location_scheme == "file" -def test_tdm_create_metadata_database_file_success(caplog, timdex_metadata_empty): +def test_tdm_create_metadata_database_file_success( + caplog, timdex_dataset_with_runs, timdex_metadata_empty +): caplog.set_level("DEBUG") - timdex_metadata_empty.rebuild_dataset_metadata() + # use a fresh dataset from timdex_dataset_with_runs location + td = TIMDEXDataset(timdex_dataset_with_runs.location) + td.metadata.rebuild_dataset_metadata() def test_tdm_init_metadata_file_found_success(timdex_metadata): @@ -321,15 +325,15 @@ def test_tdm_merge_append_deltas_deletes_append_deltas( assert not os.listdir(timdex_metadata_merged_deltas.append_deltas_path) -def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_set_and_valid( +def test_td_prepare_duckdb_secret_and_extensions_home_env_var_set_and_valid( monkeypatch, tmp_path_factory, timdex_dataset_with_runs ): preset_home = tmp_path_factory.mktemp("my-account") monkeypatch.setenv("HOME", str(preset_home)) - tdm = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location) + td = TIMDEXDataset(timdex_dataset_with_runs.location) df = ( - tdm.conn.query( + td.conn.query( """ select current_setting('secret_directory') as secret_directory, @@ -344,15 +348,15 @@ def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_set_and_valid( assert df.extension_directory == "" # expected and okay when HOME set -def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_unset( +def test_td_prepare_duckdb_secret_and_extensions_home_env_var_unset( monkeypatch, timdex_dataset_with_runs ): monkeypatch.delenv("HOME", raising=False) - tdm = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location) + td = TIMDEXDataset(timdex_dataset_with_runs.location) df = ( - tdm.conn.query( + td.conn.query( """ select current_setting('secret_directory') as secret_directory, @@ -367,15 +371,15 @@ def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_unset( assert df.extension_directory == "/tmp/.duckdb/extensions" -def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_set_but_empty( +def test_td_prepare_duckdb_secret_and_extensions_home_env_var_set_but_empty( monkeypatch, timdex_dataset_with_runs ): monkeypatch.setenv("HOME", "") # simulate AWS Lambda environment - tdm = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location) + td = 
TIMDEXDataset(timdex_dataset_with_runs.location) df = ( - tdm.conn.query( + td.conn.query( """ select current_setting('secret_directory') as secret_directory, @@ -390,14 +394,16 @@ def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_set_but_empty( assert df.extension_directory == "/tmp/.duckdb/extensions" -def test_tdm_preload_current_records_default_false(tmp_path): - tdm = TIMDEXDatasetMetadata(str(tmp_path)) - assert tdm.preload_current_records is False +def test_td_preload_current_records_default_false(tmp_path): + td = TIMDEXDataset(str(tmp_path)) + assert td.preload_current_records is False + assert td.metadata.preload_current_records is False -def test_tdm_preload_current_records_flag_true(tmp_path): - tdm = TIMDEXDatasetMetadata(str(tmp_path), preload_current_records=True) - assert tdm.preload_current_records is True +def test_td_preload_current_records_flag_true(tmp_path): + td = TIMDEXDataset(str(tmp_path), preload_current_records=True) + assert td.preload_current_records is True + assert td.metadata.preload_current_records is True def test_tdm_preload_false_no_temp_table(timdex_dataset_with_runs): diff --git a/tests/test_read.py b/tests/test_read.py index 89a5ce2..9fb8c0c 100644 --- a/tests/test_read.py +++ b/tests/test_read.py @@ -255,7 +255,6 @@ def test_dataset_load_current_records_gets_correct_same_day_full_run( ): # ensure metadata exists for this dataset timdex_dataset_same_day_runs.metadata.rebuild_dataset_metadata() - timdex_dataset_same_day_runs.metadata.refresh() df = timdex_dataset_same_day_runs.read_dataframe( table="current_records", run_type="full" ) @@ -266,7 +265,6 @@ def test_dataset_load_current_records_gets_correct_same_day_daily_runs_ordering( timdex_dataset_same_day_runs, ): timdex_dataset_same_day_runs.metadata.rebuild_dataset_metadata() - timdex_dataset_same_day_runs.metadata.refresh() first_record = next( timdex_dataset_same_day_runs.read_dicts_iter( table="current_records", run_type="daily" diff --git a/timdex_dataset_api/__init__.py b/timdex_dataset_api/__init__.py index e713149..fb35f27 100644 --- a/timdex_dataset_api/__init__.py +++ b/timdex_dataset_api/__init__.py @@ -5,7 +5,7 @@ from timdex_dataset_api.metadata import TIMDEXDatasetMetadata from timdex_dataset_api.record import DatasetRecord -__version__ = "3.8.0" +__version__ = "3.9.0" __all__ = [ "DatasetEmbedding", diff --git a/timdex_dataset_api/dataset.py b/timdex_dataset_api/dataset.py index 61df087..71f17df 100644 --- a/timdex_dataset_api/dataset.py +++ b/timdex_dataset_api/dataset.py @@ -16,12 +16,15 @@ import pandas as pd import pyarrow as pa import pyarrow.dataset as ds -from duckdb import DuckDBPyConnection +from duckdb_engine import ConnectionWrapper from pyarrow import fs +from sqlalchemy import MetaData, Table, create_engine +from sqlalchemy.types import ARRAY, FLOAT from timdex_dataset_api.config import configure_logger from timdex_dataset_api.embeddings import TIMDEXEmbeddings from timdex_dataset_api.metadata import TIMDEXDatasetMetadata +from timdex_dataset_api.utils import DuckDBConnectionFactory if TYPE_CHECKING: from timdex_dataset_api.record import DatasetRecord # pragma: nocover @@ -78,6 +81,10 @@ class TIMDEXDatasetConfig: from a dataset; pyarrow default is 16 - fragment_read_ahead: number of fragments to optimistically read ahead when batch reaching from a dataset; pyarrow default is 4 + - duckdb_join_batch_size: batch size for keyset pagination when joining metadata + + Note: DuckDB connection settings (memory_limit, threads) are handled by + 
DuckDBConnectionFactory via TDA_DUCKDB_MEMORY_LIMIT and TDA_DUCKDB_THREADS env vars. """ read_batch_size: int = field( @@ -132,18 +139,21 @@ def __init__( self.partition_columns = TIMDEX_DATASET_PARTITION_COLUMNS self.dataset = self.load_pyarrow_dataset() - # dataset metadata - self.metadata = TIMDEXDatasetMetadata( - location, - preload_current_records=preload_current_records, - ) + # create DuckDB connection used by all classes + self.conn_factory = DuckDBConnectionFactory(location_scheme=self.location_scheme) + self.conn = self.conn_factory.create_connection() - # DuckDB context - self.conn = self.setup_duckdb_context() + # create schemas + self._create_duckdb_schemas() - # dataset embeddings + # composed components receive self + self.metadata = TIMDEXDatasetMetadata(self) self.embeddings = TIMDEXEmbeddings(self) + # SQLAlchemy (SA) reflection after components have set up their views + self.sa_tables: dict[str, dict[str, Table]] = {} + self.reflect_sa_tables() + @property def location_scheme(self) -> Literal["file", "s3"]: scheme = urlparse(self.location).scheme @@ -158,7 +168,7 @@ def data_records_root(self) -> str: return f"{self.location.removesuffix('/')}/data/records" # type: ignore[union-attr] def refresh(self) -> None: - """Fully reload TIMDEXDataset instance.""" + """Refresh dataset by fully reinitializing.""" self.__init__( # type: ignore[misc] self.location, config=self.config, @@ -245,24 +255,54 @@ def get_s3_filesystem() -> fs.FileSystem: session_token=credentials.token, ) - def setup_duckdb_context(self) -> DuckDBPyConnection: - """Create a DuckDB connection that metadata and data query and retrieval. + def _create_duckdb_schemas(self) -> None: + """Create DuckDB schemas used by all components.""" + self.conn.execute("create schema metadata;") + self.conn.execute("create schema data;") - This method extends TIMDEXDatasetMetadata's pre-existing DuckDB connection, adding - a 'data' schema and any other configurations needed. + def reflect_sa_tables(self, schemas: list[str] | None = None) -> None: + """Reflect SQLAlchemy metadata for DuckDB schemas. + + This centralizes SA reflection for all composed components. Reflected tables + are stored in self.sa_tables as {schema: {table_name: Table}}. 
+ + Args: + schemas: list of schemas to reflect; defaults to ["metadata", "data"] """ start_time = time.perf_counter() + schemas = schemas or ["metadata", "data"] - conn = self.metadata.conn + engine = create_engine( + "duckdb://", + creator=lambda: ConnectionWrapper(self.conn), + ) + + for schema in schemas: + db_metadata = MetaData() + db_metadata.reflect(bind=engine, schema=schema, views=True) + + # store tables in flat dict keyed by table name (without schema prefix) + self.sa_tables[schema] = { + table_name.removeprefix(f"{schema}."): table + for table_name, table in db_metadata.tables.items() + } - # create data schema - conn.execute("""create schema data;""") + # type fixup for embedding_vector column (DuckDB LIST -> SA ARRAY) + if "embeddings" in self.sa_tables.get("data", {}): + self.sa_tables["data"]["embeddings"].c.embedding_vector.type = ARRAY(FLOAT) logger.debug( - "DuckDB context created for TIMDEXDataset, " - f"{round(time.perf_counter()-start_time,2)}s" + f"SQLAlchemy reflection complete for schemas {schemas}, " + f"{round(time.perf_counter() - start_time, 3)}s" ) - return conn + + def get_sa_table(self, schema: str, table: str) -> Table: + """Get a reflected SQLAlchemy Table by schema and table name.""" + if schema not in self.sa_tables: + raise ValueError(f"Schema '{schema}' not found in reflected SA tables.") + if table not in self.sa_tables[schema]: + raise ValueError(f"Table '{table}' not found in schema '{schema}'.") + return self.sa_tables[schema][table] def write( self, @@ -326,7 +366,7 @@ def write( if write_append_deltas: for written_file in written_files: self.metadata.write_append_delta_duckdb(written_file.path) # type: ignore[attr-defined] - self.metadata.refresh() + self.refresh() self.log_write_statistics(start_time, written_files) @@ -575,9 +615,7 @@ def _iter_data_chunks(self, data_query: str) -> Iterator[pa.RecordBatch]: ) finally: if self.location_scheme == "s3": - self.conn.execute( - f"""set threads={self.metadata.config.duckdb_connection_threads};""" - ) + self.conn.execute(f"""set threads={self.conn_factory.threads};""") def read_dataframes_iter( self, diff --git a/timdex_dataset_api/embeddings.py b/timdex_dataset_api/embeddings.py index 00655ff..92a1465 100644 --- a/timdex_dataset_api/embeddings.py +++ b/timdex_dataset_api/embeddings.py @@ -14,11 +14,10 @@ from duckdb import DuckDBPyConnection from duckdb import IOException as DuckDBIOException from duckdb_engine import Dialect as DuckDBDialect -from sqlalchemy import Table, and_, select, text -from sqlalchemy.types import ARRAY, FLOAT +from sqlalchemy import and_, select, text from timdex_dataset_api.record import datetime_iso_parse -from timdex_dataset_api.utils import build_filter_expr_sa, sa_reflect_duckdb_conn +from timdex_dataset_api.utils import build_filter_expr_sa if TYPE_CHECKING: from timdex_dataset_api import TIMDEXDataset @@ -148,56 +147,35 @@ def __init__(self, timdex_dataset: "TIMDEXDataset"): - timdex_dataset: instance of TIMDEXDataset """ self.timdex_dataset = timdex_dataset + self.conn = timdex_dataset.conn self.schema = TIMDEX_DATASET_EMBEDDINGS_SCHEMA self.partition_columns = ["year", "month", "day"] - # DuckDB context - self.conn = self.setup_duckdb_context() - self._sa_metadata_data_schema = sa_reflect_duckdb_conn(self.conn, schema="data") - - # resolve data type for 'embedding_vector' column - if "data.embeddings" in self._sa_metadata_data_schema.tables: - sa_metadata_data_embeddings_table = self._sa_metadata_data_schema.tables[ - "data.embeddings" - ] - 
sa_metadata_data_embeddings_table.c.embedding_vector.type = ARRAY(FLOAT) + # set up embeddings views + self._setup_embeddings_views() @property def data_embeddings_root(self) -> str: return f"{self.timdex_dataset.location.removesuffix('/')}/data/embeddings" - def get_sa_table(self, table: str) -> Table: - """Get SQLAlchemy Table from reflected SQLAlchemy metadata.""" - schema_table = f"data.{table}" - if schema_table not in self._sa_metadata_data_schema.tables: - raise ValueError(f"Could not find table '{table}' in DuckDB schema 'data'.") - return self._sa_metadata_data_schema.tables[schema_table] - - def setup_duckdb_context(self) -> DuckDBPyConnection: - """Create a DuckDB connection for embeddings query and retrieval. - - This method extends TIMDEXDatasetMetadata's pre-existing DuckDB connection - (via the attached TIMDEXDataset), creating views in the 'data' schema. - """ + def _setup_embeddings_views(self) -> None: + """Set up embeddings views in the 'data' schema.""" start_time = time.perf_counter() - conn = self.timdex_dataset.conn - try: - self._create_embeddings_view(conn) - self._create_current_embeddings_view(conn) - self._create_current_run_embeddings_view(conn) + self._create_embeddings_view(self.conn) + self._create_current_embeddings_view(self.conn) + self._create_current_run_embeddings_view(self.conn) except DuckDBIOException: - logger.warning("No embeddings found") + logger.debug("No embeddings parquet files found") except Exception as exception: # noqa: BLE001 - logger.warning(f"An error occurred while creating views: {exception}") + logger.warning(f"Error creating embeddings views: {exception}") logger.debug( - "DuckDB context created for TIMDEXEmbeddings, " + "Embeddings views setup for TIMDEXEmbeddings, " f"{round(time.perf_counter()-start_time,2)}s" ) - return conn def _create_embeddings_view(self, conn: DuckDBPyConnection) -> None: """Create a view that projects over embeddings parquet files.""" @@ -408,8 +386,8 @@ def _build_query( fetch results. Always joins to metadata.records to enable filtering by metadata columns (source, run_date, run_type, action, run_timestamp). 
""" - embeddings_table = self.get_sa_table(table) - metadata_table = self.timdex_dataset.metadata.get_sa_table("records") + embeddings_table = self.timdex_dataset.get_sa_table("data", table) + metadata_table = self.timdex_dataset.get_sa_table("metadata", "records") # select specific columns or default to all from embeddings + metadata if columns: diff --git a/timdex_dataset_api/metadata.py b/timdex_dataset_api/metadata.py index ca0317a..37a20d8 100644 --- a/timdex_dataset_api/metadata.py +++ b/timdex_dataset_api/metadata.py @@ -4,25 +4,22 @@ import shutil import tempfile import time -from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Literal, Unpack, cast -from urllib.parse import urlparse -import duckdb from duckdb import DuckDBPyConnection from duckdb_engine import Dialect as DuckDBDialect -from sqlalchemy import Table, func, literal, select, text, tuple_ +from sqlalchemy import func, literal, select, text, tuple_ from timdex_dataset_api.config import configure_logger from timdex_dataset_api.utils import ( + DuckDBConnectionFactory, S3Client, build_filter_expr_sa, - sa_reflect_duckdb_conn, ) if TYPE_CHECKING: - from timdex_dataset_api.dataset import DatasetFilters + from timdex_dataset_api.dataset import DatasetFilters, TIMDEXDataset logger = configure_logger(__name__) @@ -39,54 +36,35 @@ ] -@dataclass -class TIMDEXDatasetMetadataConfig: - """Configurations for metadata operations. - - - duckdb_connection_memory_limit: Memory limit for DuckDB connection - - duckdb_connection_threads: Thread limit for DuckDB connection - """ - - duckdb_connection_memory_limit: str = field( - default_factory=lambda: os.getenv("TDA_DUCKDB_MEMORY_LIMIT", "4GB") - ) - duckdb_connection_threads: int = field( - default_factory=lambda: int(os.getenv("TDA_DUCKDB_THREADS", "8")) - ) - - class TIMDEXDatasetMetadata: - def __init__( - self, - location: str, - *, - preload_current_records: bool = False, - ) -> None: + def __init__(self, timdex_dataset: "TIMDEXDataset") -> None: """Init TIMDEXDatasetMetadata. Args: - location: root location of TIMDEX dataset, e.g. 
's3://timdex/dataset' - preload_current_records: if True, create in-memory temp table for - current_records (faster for repeated queries); if False, create view only - (default, lower memory) + timdex_dataset: parent TIMDEXDataset instance """ - self.location = location - self.config = TIMDEXDatasetMetadataConfig() - self.preload_current_records = preload_current_records + self.timdex_dataset = timdex_dataset + self.conn = timdex_dataset.conn self.create_metadata_structure() - self.conn: DuckDBPyConnection = self.setup_duckdb_context() - self._sa_metadata = sa_reflect_duckdb_conn(self.conn, schema="metadata") + self._setup_metadata_schema() + + @property + def location(self) -> str: + return self.timdex_dataset.location @property def location_scheme(self) -> Literal["file", "s3"]: - scheme = urlparse(self.location).scheme - if scheme == "": - return "file" - if scheme == "s3": - return "s3" - raise ValueError(f"Location with scheme type '{scheme}' not supported.") + return self.timdex_dataset.location_scheme + + @property + def config(self) -> "TIMDEXDataset.config": # type: ignore[name-defined] + return self.timdex_dataset.config + + @property + def preload_current_records(self) -> bool: + return self.timdex_dataset.preload_current_records @property def metadata_root(self) -> str: @@ -138,7 +116,7 @@ def append_deltas_count(self) -> int: ] # type: ignore[index] def create_metadata_structure(self) -> None: - """Ensure metadata structure exists in TIDMEX dataset..""" + """Ensure metadata structure exists in TIMDEX dataset.""" if self.location_scheme == "file": Path(self.metadata_database_path).parent.mkdir( parents=True, @@ -149,91 +127,6 @@ def create_metadata_structure(self) -> None: exist_ok=True, ) - def configure_duckdb_connection(self, conn: DuckDBPyConnection) -> None: - """Configure a DuckDB connection/context. - - These configurations include things like memory settings, AWS authentication, etc. - """ - self._install_duckdb_extensions(conn) - self._configure_duckdb_s3_secret(conn) - self._configure_duckdb_memory_profile(conn) - - def _install_duckdb_extensions(self, conn: DuckDBPyConnection) -> None: - """Ensure DuckDB capable of installing extensions and install any required.""" - # ensure secrets and extensions paths are accessible - home_env = os.getenv("HOME") - use_fallback_home = not home_env or not Path(home_env).is_dir() - - if use_fallback_home: - duckdb_home = Path("/tmp/.duckdb") # noqa: S108 - secrets_dir = duckdb_home / "secrets" - extensions_dir = duckdb_home / "extensions" - - secrets_dir.mkdir(parents=True, exist_ok=True) - extensions_dir.mkdir(parents=True, exist_ok=True) - - conn.execute(f"set secret_directory='{secrets_dir.as_posix()}';") - conn.execute(f"set extension_directory='{extensions_dir.as_posix()}';") - - # install HTTPFS extension - conn.execute( - """ - install httpfs; - load httpfs; - """ - ) - - def _configure_duckdb_s3_secret( - self, - conn: DuckDBPyConnection, - scope: str | None = None, - ) -> None: - """Configure a secret in a DuckDB connection for S3 access. - - If a scope is provided, e.g. an S3 URI prefix like 's3://timdex', set a scope - parameter in the config. Else, leave it blank. 
- """ - # establish scope string - scope_str = f", scope '{scope}'" if scope else "" - - if os.getenv("MINIO_S3_ENDPOINT_URL"): - conn.execute( - f""" - create or replace secret minio_s3_secret ( - type s3, - endpoint '{urlparse(os.environ["MINIO_S3_ENDPOINT_URL"]).netloc}', - key_id '{os.environ["MINIO_USERNAME"]}', - secret '{os.environ["MINIO_PASSWORD"]}', - region 'us-east-1', - url_style 'path', - use_ssl false - {scope_str} - ); - """ - ) - - elif self.location_scheme == "s3": - conn.execute( - f""" - create or replace secret aws_s3_secret ( - type s3, - provider credential_chain, - refresh true - {scope_str} - ); - """ - ) - - def _configure_duckdb_memory_profile(self, conn: DuckDBPyConnection) -> None: - conn.execute( - f""" - set enable_external_file_cache = false; - set memory_limit = '{self.config.duckdb_connection_memory_limit}'; - set threads = {self.config.duckdb_connection_threads}; - set preserve_insertion_order=false; - """ - ) - def database_exists(self) -> bool: """Check if static metadata database file exists.""" if self.location_scheme == "s3": @@ -241,20 +134,6 @@ def database_exists(self) -> bool: return s3_client.object_exists(self.metadata_database_path) return os.path.exists(self.metadata_database_path) - def get_sa_table(self, table: str) -> Table: - """Get SQLAlchemy Table from reflected SQLAlchemy metadata.""" - schema_table = f"metadata.{table}" - if schema_table not in self._sa_metadata.tables: - raise ValueError( - f"Could not find table '{table}' in DuckDB schema 'metadata'." - ) - return self._sa_metadata.tables[schema_table] - - def refresh(self) -> None: - """Refresh DuckDB connection and reflected SQLAlchemy metadata on self.""" - self.conn = self.setup_duckdb_context() - self._sa_metadata = sa_reflect_duckdb_conn(self.conn, schema="metadata") - def rebuild_dataset_metadata(self) -> None: """Fully rebuild dataset metadata. @@ -274,9 +153,8 @@ def rebuild_dataset_metadata(self) -> None: with tempfile.TemporaryDirectory() as temp_dir: local_db_path = str(Path(temp_dir) / self.metadata_database_filename) - with duckdb.connect(local_db_path) as conn: - self.configure_duckdb_connection(conn) - + factory = DuckDBConnectionFactory(location_scheme=self.location_scheme) + with factory.create_connection(local_db_path) as conn: self._create_full_dataset_table(conn) # copy local database file to remote location @@ -289,8 +167,8 @@ def rebuild_dataset_metadata(self) -> None: else: shutil.copy(local_db_path, self.metadata_database_path) - # refresh DuckDB connection - self.conn = self.setup_duckdb_context() + # refresh dataset to pick up new metadata + self.timdex_dataset.refresh() def _create_full_dataset_table(self, conn: DuckDBPyConnection) -> None: """Create a table of metadata for all records in the ETL parquet dataset. @@ -319,7 +197,7 @@ def _create_full_dataset_table(self, conn: DuckDBPyConnection) -> None: conn.execute(query) # reset thread count - conn.execute(f"""SET threads = {self.config.duckdb_connection_threads};""") + conn.execute(f"""SET threads = {self.timdex_dataset.conn_factory.threads};""") row_count = conn.query("""select count(*) from records;""").fetchone()[0] # type: ignore[index] logger.info( @@ -327,45 +205,30 @@ def _create_full_dataset_table(self, conn: DuckDBPyConnection) -> None: f"elapsed: {time.perf_counter() - start_time}" ) - def setup_duckdb_context(self) -> DuckDBPyConnection: - """Create a DuckDB connection that provides full dataset metadata information. - - The following work is performed: - 1. 
Attach to static metadata database file. - 2. Create views that union static metadata with any append deltas. - 3. Create additional metadata views as needed. - - The resulting, in-memory DuckDB connection is used for all metadata queries. + def _setup_metadata_schema(self) -> None: + """Set up metadata schema views in the DuckDB connection. - If a static database file is not found, a configured DuckDB connection is still - returned. + Creates views for accessing static metadata DB and append deltas. + If static DB doesn't exist, logs warning but doesn't fail. """ start_time = time.perf_counter() - conn = duckdb.connect() - conn.execute("""SET enable_progress_bar = false;""") - self.configure_duckdb_connection(conn) - if not self.database_exists(): logger.warning( f"Static metadata database not found @ '{self.metadata_database_path}'. " - "Please recreate via TIMDEXDatasetMetadata.recreate_database_file()." + "Consider rebuild via TIMDEXDataset.metadata.rebuild_dataset_metadata()." ) - return conn - - # create metadata schema - conn.execute("create schema metadata;") + return - self._attach_database_file(conn) - self._create_append_deltas_view(conn) - self._create_records_union_view(conn) - self._create_current_records_view(conn) + self._attach_database_file(self.conn) + self._create_append_deltas_view(self.conn) + self._create_records_union_view(self.conn) + self._create_current_records_view(self.conn) logger.debug( - "DuckDB context created for TIMDEXDatasetMetadata, " + "Metadata schema setup for TIMDEXDatasetMetadata, " f"{round(time.perf_counter()-start_time,2)}s" ) - return conn def _attach_database_file(self, conn: DuckDBPyConnection) -> None: """Readonly attach to static metadata database. @@ -649,7 +512,7 @@ def build_keyset_paginated_metadata_query( **filters: Unpack["DatasetFilters"], ) -> str: """Build SQL query using SQLAlchemy against metadata schema tables and views.""" - sa_table = self.get_sa_table(table) + sa_table = self.timdex_dataset.get_sa_table("metadata", table) # create SQL statement object stmt = select( diff --git a/timdex_dataset_api/utils.py b/timdex_dataset_api/utils.py index 4a9ba08..6fea970 100644 --- a/timdex_dataset_api/utils.py +++ b/timdex_dataset_api/utils.py @@ -5,11 +5,13 @@ import pathlib import time from datetime import UTC, date, datetime -from typing import TYPE_CHECKING, Any +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal from urllib.parse import urlparse import boto3 -from duckdb import DuckDBPyConnection # type: ignore[import-untyped] +import duckdb +from duckdb import DuckDBPyConnection from duckdb_engine import ConnectionWrapper from sqlalchemy import ( MetaData, @@ -106,6 +108,109 @@ def _split_s3_uri(s3_uri: str) -> tuple[str, str]: return bucket, key +class DuckDBConnectionFactory: + """Factory for creating and configuring DuckDB connections. 
+ + Args: + location_scheme: "file" or "s3", determines S3 credential setup + memory_limit: DuckDB memory limit (env: TDA_DUCKDB_MEMORY_LIMIT, default: "4GB") + threads: DuckDB thread limit (env: TDA_DUCKDB_THREADS, default: 8) + """ + + def __init__( + self, + location_scheme: Literal["file", "s3"] = "file", + memory_limit: str | None = None, + threads: int | None = None, + ): + self.location_scheme = location_scheme + self.memory_limit = memory_limit or os.getenv("TDA_DUCKDB_MEMORY_LIMIT", "4GB") + self.threads = threads or int(os.getenv("TDA_DUCKDB_THREADS", "8")) + + def create_connection(self, path: str = ":memory:") -> DuckDBPyConnection: + """Create a new configured DuckDB connection. + + Args: + path: Database file path or ":memory:" for in-memory database (default) + """ + start_time = time.perf_counter() + conn = duckdb.connect(path) + conn.execute("SET enable_progress_bar = false;") + self.configure_connection(conn) + logger.debug( + f"DuckDB connection created, {round(time.perf_counter()-start_time,2)}s" + ) + return conn + + def configure_connection(self, conn: DuckDBPyConnection) -> None: + """Configure an existing DuckDB connection.""" + self._install_extensions(conn) + self._configure_s3_secret(conn) + self._configure_memory_profile(conn) + + def _install_extensions(self, conn: DuckDBPyConnection) -> None: + """Ensure DuckDB capable of installing extensions and install any required.""" + home_env = os.getenv("HOME") + use_fallback_home = not home_env or not Path(home_env).is_dir() + + if use_fallback_home: + duckdb_home = Path("/tmp/.duckdb") # noqa: S108 + secrets_dir = duckdb_home / "secrets" + extensions_dir = duckdb_home / "extensions" + + secrets_dir.mkdir(parents=True, exist_ok=True) + extensions_dir.mkdir(parents=True, exist_ok=True) + + conn.execute(f"set secret_directory='{secrets_dir.as_posix()}';") + conn.execute(f"set extension_directory='{extensions_dir.as_posix()}';") + + conn.execute( + """ + install httpfs; + load httpfs; + """ + ) + + def _configure_s3_secret(self, conn: DuckDBPyConnection) -> None: + """Configure a secret in a DuckDB connection for S3 access.""" + if os.getenv("MINIO_S3_ENDPOINT_URL"): + conn.execute( + f""" + create or replace secret minio_s3_secret ( + type s3, + endpoint '{urlparse(os.environ["MINIO_S3_ENDPOINT_URL"]).netloc}', + key_id '{os.environ["MINIO_USERNAME"]}', + secret '{os.environ["MINIO_PASSWORD"]}', + region 'us-east-1', + url_style 'path', + use_ssl false + ); + """ + ) + + elif self.location_scheme == "s3": + conn.execute( + """ + create or replace secret aws_s3_secret ( + type s3, + provider credential_chain, + refresh true + ); + """ + ) + + def _configure_memory_profile(self, conn: DuckDBPyConnection) -> None: + """Configure DuckDB memory and thread settings.""" + conn.execute( + f""" + set enable_external_file_cache = false; + set memory_limit = '{self.memory_limit}'; + set threads = {self.threads}; + set preserve_insertion_order=false; + """ + ) + + def sa_reflect_duckdb_conn( conn: DuckDBPyConnection, schema: str | None = None ) -> MetaData:
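
A minimal sketch of using the new DuckDBConnectionFactory on its own, the way rebuild_dataset_metadata() does in this diff; the constructor arguments shown are taken from the factory's signature above, and the database path is illustrative only.

    from timdex_dataset_api.utils import DuckDBConnectionFactory

    factory = DuckDBConnectionFactory(
        location_scheme="file",  # "s3" would also create an aws_s3_secret via the credential chain
        memory_limit="2GB",      # overrides TDA_DUCKDB_MEMORY_LIMIT (default "4GB")
        threads=4,               # overrides TDA_DUCKDB_THREADS (default 8)
    )

    # in-memory connection (the default path ":memory:"), already configured with
    # httpfs, memory_limit, threads, and preserve_insertion_order=false
    conn = factory.create_connection()

    # or a file-backed database, as rebuild_dataset_metadata() does in this diff
    with factory.create_connection("/tmp/records.duckdb") as file_conn:  # illustrative path
        file_conn.execute("create table records as select 1 as id;")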
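
And a minimal usage sketch of the composed API this patch introduces, assuming an existing local dataset at an illustrative path; it uses only calls that appear elsewhere in this diff (TIMDEXDataset, metadata.rebuild_dataset_metadata, read_dataframe, get_sa_table) and is not a definitive example.

    from timdex_dataset_api import TIMDEXDataset

    # one TIMDEXDataset now owns the DuckDB connection; TIMDEXDatasetMetadata and
    # TIMDEXEmbeddings are composed components that receive the dataset instance
    td = TIMDEXDataset("/path/to/timdex-dataset")  # illustrative local path

    # rebuild the static metadata database; per this diff, the dataset refreshes
    # itself afterwards so the new metadata views are available on td.conn
    td.metadata.rebuild_dataset_metadata()

    # query current records through the dataset (mirrors tests/test_read.py)
    df = td.read_dataframe(table="current_records", run_type="full")

    # reflected SQLAlchemy tables are looked up by (schema, table); the data.embeddings
    # lookup assumes embeddings parquet files exist so the view was reflected
    records_table = td.get_sa_table("metadata", "records")
    embeddings_table = td.get_sa_table("data", "embeddings")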