diff --git a/.gitignore b/.gitignore index a5f5e47..d86889b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ /data/ *.shelve /__pycache__/ +/test/ +merged* diff --git a/docs.weight b/docs.weight new file mode 100644 index 0000000..36d6d84 --- /dev/null +++ b/docs.weight @@ -0,0 +1 @@ +{"97ec17cf1e572a9f25e55b7dc0718b979dd5c9bfe6c3eb78ea20c88c0c68a4e1": 23.211429282062095, "c4998ef5b34b39a6f0c6eebfc669df8c50b18eb2e9f8f666469d4a2bb18bfa86": 89.50211096543877, "d7cf638d4e0c1ebf34497b5fad6da8a3c139d8b14e00318e02ce423cb84f77b1": 31.879096611326435, "ca50d33113f673f3330b8b92e75d4ca9de13f04aab24945d02f8b29f17c4e678": 56.49414928645974, "eb5179d67c6b39c45b199fc9ccd85ea27b8a7964a33140c5d4c976843bf1da56": 53.38400589146967, "ec35f050a4840c349594ca07d4bc7e08e1da4330ba4c4047994062f615c9199e": 74.43418858109507, "d3b84ad872772db4d250c2e11f2314dc8c93ed45abe90c0049e065db6750bd19": 74.8493943282779, "aea1caee5eee2184127f8867c6e3bd4929be507998fe41504d6a5c173c472a66": 52.58783923329481, "66327cc3d1373919295657333711ef812e9a0b266a0389faa5d1bc8d6cdaf18b": 1.0, "6d24a375c0550a2af59cc5e84cb61dbd8f55147413c4eb5de7128b5087221daf": 2.770364865278749, "954f9f83b64479d3840a9b94bb9e713e5c3fc2deaeb435c1f7518db53b4ec848": 35.03521931290655, "92677d772ecb2382b12e039448f04d8b8722afc72a39cb7824c4d8a99b0f9d72": 48.17528450233468, "812072c281f4ed88fcc5489bc6f3cdf86617d815da1e948f81629f2671fbcdbf": 23.264397762153813, "8df10a245cf0a60252c8d6f8ac90b2c8aecc71fb220018660a7f5c330770f046": 27.463739029773105, "3e5cdc7855f06dae6b087779f17593df2e1d2e606a339d54f678701c4e0f4779": 23.1304793132926, "3dd1dcb28df84b82aba2da76227b1af16267f97da24448a30c84eb318e82d84d": 70.60047968232763, "345e4bd69edf6a2f141c8c21def641be10f14fecfdd73ab049b6f99c522b3b13": 23.18232479904254, "2a682642ab15d837cadceafb06426b1729bcfe1beb7f61e040a79c0b49ec3167": 60.84419656769506, "6b1145655efa1130092ab67c7c4630718b1b9d82630c12bb04706a7074ac9758": 41.149047021663556, 
"1efc6180dc3995d65127a94a74265604aeb1d21e8e803ba86f35be473004e2c2": 23.20164812870424, "067c7f73631864772190487734b60867cbc9340c93b6d1821de7306d25826029": 27.463739029773105, "bc34879b7ee883c639b2348de2ae91a5fa7ceed4e4320cce6af9e27109e2aceb": 91.38355259829086, "920dbd560e078232b99223bbc3435c0e7fe6d7660e0deb17d0b2bf17730b892c": 38.170474162853544, "f4dfc915a1b3b2f71298e31b6eac389d46ee96ad09963ff69a00cf40f8c04c1a": 5.263057138323923, "fb0b1a03e2e8b86fdae34d394caa6b8867bce92c4f25ee0c5e7e1b48a63eab7d": 6.719686371912561, "f66600277c08486cdf28796b401c13123215233cdb5fa7e5352089bf9f95f51a": 9.089140561068787, "efb4ec871502e4e5655bcb180bf3057c612c2fdbd21eca4c0d222a1c550b7aa9": 5.756114415755812, "f08602043c0ebe317c321a8de28088d9e690a64b3b2e81f63de4cb8fde1e67a5": 5.263057138323923, "ed50028467b187111b56d5502ce0236b9baeca01aa6d2c8a7971e54afa077f0a": 6.719686371912561, "ed2b06ebd15043a165b1cf638e6b502e220a425a7c12d822d79c8295eba82887": 6.719686371912561, "e97f4a3e8d6b394d68d97833e571367e2879de721a1fefe3b6c197cc27c95efd": 5.263057138323923, "dc13f30863ad788c9fd0aa46140d496ac581caf15c72996e5501df51c234efce": 5.756114415755812, "db6d3214eca641297641e7f3e9cdb2035b705b11c39358ad689ecfb35b0c3618": 5.756114415755812, "d66e5959ee0d3623e5ad68d667dcea348d59459ca888454a853386cb11fdf887": 5.263057138323923, "d27892f95a6313aa6325879944fc86a9e0a1e35f20f50f1886c20dbbdbfba049": 5.756114415755812, "d0419a9eb63f826735dde37bcb6d903776ddb23b5138f69b38bb14e62f317dbd": 5.756114415755812, "cd4f0b57fd5dfe0fcedab330c3535886b01ded97d9c39203ea268a4cc2e92131": 5.263057138323923, "be6aa991e82776c34211d704705d8a922d945bcdd114e7a03bc8d3a4df1c26be": 5.756114415755812, "b82fc67d16fd18c552ed7ac58c7439c6fe3c83e846c1e393803c77a0e6718306": 5.756114415755812, "b34fbdf1b66a787550fd8666edacb3ead90a8965f621616bfe888386547112f4": 5.263057138323923, "b091890236ac1db422a1b37ad0ccd2d08bbcfeb0b18657cc536845f9bc703861": 5.756114415755812, "87970495632346d3de1849853d86b2a65f33314b63c4a03cc6fd0e5686777f7d": 
64.13926539812697, "ad5a799c1ead886776a4ed6b8a0a268f59a33473ad5993f20907947e2951eab1": 6.719686371912561, "bdfe7f4d71744b379ecd07757e59a2aebd5acd1e1cebcbbfb4b1174bedafa3f0": 68.5635065560256, "ac53d1654801defe45c3bbac3bef5b87cd7c2fa436f8fd5e87c14c3e8ba8b41c": 5.263057138323923, "ada143ad7a352e06cc6cfab6fd921cf5a89312bd6ad0505230678fc2f444d644": 9.089140561068787, "a785de5a0e59b49b43ed9b1e8fb05dcf372cc3ce713bc2121c1ff7de987c2575": 41.223494202262245, "a5de30a691de378f9dbda2068e4bf538fcdd331954259e88ab2173ee0c8b18f1": 87.57735879241608, "aaddaba74b5a3bedf20a47de502aab4bcd238574f2dfac7598ddd631ed302126": 5.756114415755812, "072196284a46eb0041e42d59f0ae6a5a45ac92c857a512ea79a0bfe1076d56df": 64.13926539812697, "a51dceb810fd51745cffc4200de0cd76ff6847dbfae81faadd82360f05f67329": 5.263057138323923, "1c1131431fd2a251600dd0548975ff7c55dc5549ef805553ceb4e0f90ee19350": 127.09375758856166, "9fb1e03fab83daf4c6bf2bf3a318288b9120fea064bdc59fd6882b47986b77cb": 5.756114415755812, "a67c1c3443e54f176be8e512bf6206949d6f3795fd29f255cee82f2dee8c1391": 9.089140561068787, "a869b3d816276a8906188053faf9cd3e850bdbaa8133b970e730c03501e18b87": 9.089140561068787, "99d8ca64d3159721da3b2de602680689325bb39374f2b839540b754abd13d017": 5.263057138323923, "98f3a435e689441f92dbb1987a00c394255d67d42ee5d27d775f51a1ae4ed91a": 5.263057138323923, "9b62bb57925b5cc7be4410637eb27adaef7294f2d70b1445270a4eeb2f3729f1": 5.756114415755812, "9ce51a8682ef8243690d4445fec9b3008b593ad958daddb8138098ffa37a0f6b": 9.089140561068787, "915824de6184b072c049418eb9748a4dea3a30ec62393a417c7d5bd6ea510398": 5.263057138323923, "8c040cfd76c356cfee2e8e1b1f6e8abcc07ec4e254fe2e250955858c0ac625f6": 5.756114415755812, "9f07aa7921ba582578986346e4f2a9e10f12becd4eb09ae11c2b9ecda1ed0a3f": 89.21410370699289, "9408b72b6f75aa5a2e92f171d075da7d5dfd55c13c2e3b8d50186f9cda2c5d3c": 9.089140561068787, "8ad65680673275011a28f1f2471c2a1f590f9ee628661cd70bd9c019835ff476": 5.756114415755812, 
"007f1dcab3731c49a22f8ff0d1a24ec0c26759650b01b2d5dbe3a787865e13ea": 127.09375758856166, "8b4c752470740d1eb4a8ac6d640e634e082514f8b31e2530d14e4fdba19ded83": 5.756114415755812, "8603f3778878b51412d4aaf8be2c020d1c9461c904bde898a01329cf24b19fd6": 5.756114415755812, "7d40306e883fedccd4f334dc487e7cf999df1827c8b54f889ddcb9df604096f0": 5.756114415755812, "82856a8de085bdb179967be59e3a041870057dae7bca8b72f4a6d3a67af45f9f": 5.263057138323923, "83bf23754962b76c2d5b3d75c0175603e2e554e626ad218a302f949ee1d41d31": 6.719686371912561, "8cf924a8684ebbb31a509f0359dcd92435131d814ed0ffe5f2b334dd72646d21": 54.11243394509106, "7ce107dc72ccb3a43ce3e13ba9f4a016cf238cd86fe3633e571b602cff4d2a0c": 5.263057138323923, "78a29a7a1a3c54862e957c63785587ec0c1df23e286343af8121fa651852b34d": 5.756114415755812, "75e7525f276a53795dd8dd68769283574f48c4b96618fd872fcd5881864adf51": 22.767294490117994, "76cf1a608e94b207d8be4634cea94e668b5e1733bb097e53937eeb2f71daae00": 88.060768544003, "7c531be8d2aed825a88c3dccdc2f9305e7cfa3e75a546c27b45ee09f2974e663": 9.089140561068787, "7112db4f2817ccdfaeb1fe50b66745dd8cc07a6b927942e5198e20f82061cc5d": 5.263057138323923, "7bd590ceb8a331e44e8fe8626e81cda024737b28177db34718bb13de6b2dbcda": 6.719686371912561, "68b2096ae0e570e375e507a9d84bd5449500025acbbdab1fb998dcd24f549d62": 5.263057138323923, "65481a03ecf7fe5b0b5f287277ef6c7eb58fb1003f1a2b6d58ebda80e55b2fa7": 5.263057138323923, "60a2c63a61cf1ee36f2cb4cc4efa33391ff6f96ac4981e85b8b51324f6a786b8": 5.756114415755812, "5c4253c18266df49ccfc8efb07f81d384a551e53f0b205200c04164a81d676bb": 5.756114415755812, "5c20ff95b1d44911d9734237bdd3b8780fadb276ee1c5f686757a2f29cd7047e": 5.263057138323923, "561c63f7b559d4198b6fbef36913a54057ba3e0997da9437c816798af84fcfad": 0.43395541890454786, "596391a3f230fc91a56d25942f21d3db3f5bbab1ce37d02e6f2b567db8148f15": 5.756114415755812, "527911d7a67ad292aaf95b7f9961a60365796dd3e157f039a190a0278cb2a687": 5.263057138323923, "52d20a59fb8876dcabf621db4ecd8447c8dfd998ca9722021befe6485c92589a": 
5.263057138323923, "5bca7f7d6840826f06efd81a7501dfc3d2e369b9526247d7f1469088deac55ea": 9.089140561068787, "524d854881bde6182add1463f768c0b8f61fa731975900ca735010c4161b4cb3": 5.263057138323923, "5167395e400220ec40397e5bf23d985033349f8d6535c0640fe25cad83134295": 5.263057138323923, "4d97518cd57dc859b5458acae04006d8d9a03e98d60afe43128ce6b3bb55ba10": 5.756114415755812, "4fee6eab61bc52c80e9987beb387e4e69843d0318f7f981790318019567a3ff4": 33.50389023490836, "481526b3a42e368d1162bc39b3e5536b6dd944c2a65a2ee2a3ed94aa4f54a5c4": 5.263057138323923, "6c647534f5a28e31cb903c544fc38a8b1cc28b3ebd0ef08dd81ba35ef401732a": 121.94086402932783, "432a266b18aa96d66d7fea4182d10fa6b138c5c691dc6e5ef691361d64e92cde": 5.263057138323923, "3d830de9e15f955ce86b29357a540be7e8f1cd9e0b31e81da591f81444cdf5be": 5.756114415755812, "3d6fc8e4bcf549ef1a795030dde61cb2b5be9f470334c621228058938b1bc655": 5.756114415755812, "3abb68a0caf162a42a187f3c0c111c864e9d2878da402615808113421165e9cb": 5.263057138323923, "3b8d92045f2f2ab252bcc15e4a076bce8d8214ec218787c8f3ac861f4e8033c7": 5.756114415755812, "3b4ccb94f9b2469c01b4d327e5c0685bba31def428f741aabe1f922a6363a604": 5.263057138323923, "38b4473004a1066d53c93dcf0061f1d9c3c4b3fab9e1d86f66fdb9e8865ff57b": 5.756114415755812, "2f85b686f19382cd16b8dbb819cc4ca70e9736da46323c8b885352a5d4fe7108": 5.756114415755812, "32234f92a79c006e6027f831568fc099f6ed384a07d4732596acb45fd966dd7d": 69.67525181017214, "2fcf9e008d1bf6b953ab0a4832df945ffb76fc84de2cd8c58fa6fa723a8ec663": 5.756114415755812, "36a4c9ec24886be19483a85ea8000eee0ecac68ef6e0c90a46b04284d96c64d2": 6.719686371912561, "289f10529cd62f0b26bb3eb19c96e7996baadb8ecb3eb097dcfbcd2e8bb54278": 5.263057138323923, "24a2c0b7abde0eeb3ecfbc44c38184b5f5256f9faa93c7fe839d635f1ba3a3a7": 5.756114415755812, "17c1386fc7a0ae26bdbee017cf600b0aa3927fef85802cc8f4cf3d5e856f83e5": 5.756114415755812, "17261efdbacd8afc6ab946790ed837bedef5fde98de685c0b5606bc3da451dee": 5.756114415755812, 
"239ac3b0055704ab928088efb283dc6ae9feb25274ee139d656ed9e4d193d9b0": 6.719686371912561, "274927b9e28e67c1954419d501caf36009e7dc0d08e222224779834a55fdf30c": 9.089140561068787, "7e7ab052f410de3ff187976df4a61e51d50faea14edba3e6d24c15496832dcb7": 136.95295347404118, "171cc9a9024e152e7da37dfba430f03922fdbb3fa759f8d8ce3618bf3a767054": 5.756114415755812, "13a9672a68ddb52f71846a3ab6d3c8da3aafa07042ee2d0df77c8b53e8b3f4c5": 5.756114415755812, "08376d41f4126094bf4417fd681fb590e4ac96e0bdc62f2854623c7a5aab9b9d": 63.63793741171452, "0ed283489ea360fd732618fd9ad9ff479188fd3dded2f022d1370d338a555a3e": 5.756114415755812, "0ebd102498c82ad0018205cc6e6384ea05a35784542424f59c89a771befc1139": 5.756114415755812, "128cea82dbacba252f36b80828f0dd192cbb2f145c0eaef267f913b0e5ccdc56": 6.719686371912561, "03f8bebb71e1faee89876f65f539bcf18e538c342894cb1ff3311a256c57e904": 23.103178823089763, "050395f03b741bd09928a4565235a8e2c41d2e0359f1b3e8a6f5002bd68dc647": 5.756114415755812, "0ab36ebd23d99a09f5dee6cf277082d086fc7057f8b77b82ddbde5505b48bc0b": 5.263057138323923, "07199dfed0add19bff08ca351d50e3fc05c3cded24519076fc2516834d2c092f": 5.756114415755812, "7ed38aadc40b801ae09c3522d36e86d7145cdcb503bf702f4378cdaceb1138c3": 134.2767341288201, "e3beba633d316542e281af4e4bce5e587b773de274f83642c374862afebdf247": 35.778025812873096, "e06f7479d32a9360745a13f663295bfb2872d73f02d8581904b743a9c9af5565": 32.62574640672045, "fe09ec4e5f6fcf41ad87db12eb049d42263fe5e7eb979d1511c44a8bc9a5879e": 52.10063880532058, "cc9c0855da0bf0f3db5ae2b644bd00116fc5658bc0b58c63debbba9cca38702c": 48.34113701931127, "ff25571561cdadd1abd86416fa0a4b1bfd69827b0177e93d4b29021cd5e0eaf6": 82.07406974227474, "fcfe467beccd64b07e4c9a6325be7923d2f4b6bfb6477f8534b517eadcfd5e5c": 56.56707620647808, "bc4b5fc40ac9ae5f93a2cf0237e24f3b9f49a5aecbd87d910afa0d8ecd1b9564": 20.953689572683768, "b7a3212880af711d97e13e85bcd1e8ab5133f83a6ff116f4428dd764b9f6921b": 67.40277039891959, "a431f7750ccedcf305097e68d508c467ac8016e84f237a3243b400d148737913": 
71.30075274048076, "b231a7323db1ce03707f598063ce1dbff97b9082554006f140ddcc088196965a": 56.873897795178685, "990629398fae1ddfe911beeda42a199b0e4c5a700cd111e1085404df76b8ac69": 32.62574640672045, "95c3f9dc662f1fe7ed6982cf896e810756fda1098742bf09659f05a33d9c790a": 4428.681879529149, "95c90e28c9777ca149fc99ff728322245afb98400d9ffe1bedd75f0067a4a69b": 35.778025812873096, "759a8651e072cd4de3a68cf655ea34016a9999b4a836141a996d022671d8618e": 71.30075274048076, "743de13f937735925bdcad34124376187ed9e9d06c95eb5421c65003a456371a": 68.61565538451049, "da368964d366827ed010eb0ed5dd009bcd4e25c11674babfec2160fcc0aea636": 41.256540790439345, "9334be80168248efc2999ddc0e0c3271d07ae0e78ef68e6e811e4beb01358520": 61.87495716553897, "4c8e1f68e63202d247e49676b27300baf3bd28895e74be5da135a175b71a2cc8": 20.953689572683768, "56b3ce3f2e5b07e5a0e0d60cb2773eb3266d2a58854246188e6a64160947257f": 82.07406974227474, "3d0b76f58fa23e56d9a6211a70773f512b36bd8fee305a047b706a4694337697": 1.0, "555b33590f7998864b2989a769fa51efb22f8f7c7c1ee2c602f94dae13385400": 67.40277039891959, "3e7ca6478d75e9ad3259825388f823d5421a7e2d67bb28e338bc96f986ae3b46": 52.10063880532058, "c6ee5ebfb0f62002f14b48d54a08896915cbd782f47d2b3423fa82707732f9de": 62.540476856086194, "b227de97f104a3aab76321ceb978288d89ba35f6b278db0e44e0c374cc9a6d0a": 62.540476856086194, "424268804f2fe3300e103ee3f06172b9899a7d97a36954ade5aeeb0336cdb82b": 61.87495716553897, "30295c6ff75b51b37177c70e52750cbc0b5f82ac660c0c4e6b13188259b3813b": 25.812016674595395, "1bf28e7c81dc845dddeffe3823e08c1c849ace19349920490287e52b7f69c49b": 68.61565538451049, "10735eac785fd9dc2290429e8920742bfed2dbe001cb5550c199f5453f720816": 48.34113701931127, "1170b8b0f29dab898de2c70541772819161787527d9a810bef27730b31915180": 56.56707620647808, "ef7905bfcbea9c15846e1bdbbb81fdb676b9def3a9934660afebfc16df145f7d": 19.616457968159157, "fdd2fee81d7804cd46050a339ed60080c812887a8e3fc46c23be92e9b33d1a60": 94.97438159617472, "ff57447343bc5f4777594cadd26ff93d68b1b431d3a3209df27135adee69aa85": 
61.05561917655177, "0c26174c1fc58a41e5c9ed2e823aa8c2b3d54d5563288cf8a38f16036df3d452": 25.812016674595395, "59b3da4ead2ace21807e9527fe4ef2397af283ccb311b038e26503dc33e84d7d": 41.256540790439345, "c6ba6a20cc49211fa46fd443e105d986894d17c422f33b2fd5f4cee79ce1a896": 36.603607532186686, "0d50e777b593808e2c3fa81ffca30d71900e48d4f415e10f5f9b71fc67fab54d": 56.873897795178685, "99d229a76a797e11f3ef226081e5523ce867b8a62567aa70a32f02ed629609dd": 21.52986567030294, "ed16a9d8b9e28cee17ad1f2de25a45dd05d55813d7eec4b58cc9da7f47bc2558": 29.43518093400973, "c65b8edafd1828aaff488f2f9032a4dddf191bf6d27c4cf079371aa14628c2da": 16.978582999500468, "c6ecec0b18332e60a9c6beaed199792883017542723cb8b3dc301300463fc269": 13.393566353593544, "cb52c9ed713adc94b8dc3f3995568b6a086c17c7be7aa4c02286efd2b57ea6d1": 52.34457529993926, "90f33cfc36ccb1e0fc7e33a11d08dc1c9efb91f25b755200cf794d5fa1cb6813": 19.507224927089627, "b21138edc9d57c0acc7d1934b4a47d435cef9154efa968637080e9e4f05f2a9a": 20.37564862301857, "8f20f5aed4203fb9f7a021acd7b20e854a80b7ae7416c1690c246b22b1aadc99": 64.53647370376414, "ae2ad7cd06ad5023554ff6c2e866cd88315a5863fbce6344e698d5d307c1874a": 24.31443041034477, "31fdfacaf03c92f111b2fa3f11301748a6013701ae7a26910a0f3f79367f9800": 44.01591118774173, "a3130a5d73494a31d948939d1b63385682e0da41202f31f276c33a460a229e25": 53.79547653524554, "2b1d10e4f34543bba1a2d39f96b880ba034b7e02b8ab584ac40faa54c8c7db71": 21.52986567030294, "2d4102bf2e9358ecf7aac599eb6cf15fbe6399b5350bb4fee7d882fad7bcf769": 43.685706244399, "299526c38707f54cbf4e4c0733315816c6b72fe9b1ef7179b04e9262785be1d5": 22.653758896975997, "941fc7aa41b9ff58afa47e51fa9cbeed570fb3fc49661cf1d82dc7f2176e5ca2": 13.936827476576818, "6c700e1152df5b57b5462ccfdc49630576ed3c0ef646dd3098ec60871fd2a543": 129.6328441452158, "6201ddbb35f2b0277ba8d76c1b8ec09ccd217c7fdcd7a930ea07024180751f3b": 15.58070223131001, "58ef158fe4578f461f6793d92b0ecc112162ac6918a42af9e1e83300eef17a7c": 25.495002926499463, 
"36579ca20cef8afb1042ec7e8a10607b7961bcf6d68dc8d07269fd363d49dabf": 22.723940900965406, "015609c17e57bf614cc7c9985f4e14539c6e49477eca623df9f9a7035a017bde": 70.45927406461834, "174901748106109f8ce477e44fb70d81a352d9ac5eea01bfc9f838920071d21b": 10.976865212283467, "28e9654011fcfef36cdc6ebbdc0fb2181dcc92f917a5abdda36656f0b800d1b4": 14.389816693266903, "0c12abd07f75e696b9e9ff29fc03fc0b192a95dba7794c8034a75e0ce915d235": 17.020765132086925, "0142f2115e73a5e242d88801b49dcd26333edc301abefeac5e3a9d647417bd19": 58.81687908868286, "090d89f42d126eb38bca96262ca4e9e9d78ed3c1f5af2f5ff01ca3394b43bb2f": 31.175231156603047, "0d0775d52de8e80f14f009ad7b54381c9104894e91077483644d7371beb6243d": 41.9847898656907, "ffc364e36d113041b8d4238ad78bef9e08a36869264e2538a59678d39ec66f63": 63.344598810365895, "02288be05fd57c7c0aeda2592dcc2e1c2af12926f27fa703cc42f09e8fc611a3": 15.064311642812767, "ef221d4d334ec54f6cb43170df33f8536818a01b12079335bdae1ee0cf0d4aea": 55.5117123375655, "ec9df56b63121fcccffeea6d97124c4de3fc0bb91088c3d98c23b3aba04c0ddb": 26.606038403688853, "dde60e073f37bdcd34cb788430c1bcff0bd01cfd929f8ceb0c66b85ffb2ae40f": 35.35698485240001, "d14e6bf4d522c3ff7996f15267f2a08acd8501f50803457ca25a39267a710542": 58.03354549050647, "d0e6c7b185c1ffd6686f3abef8d95ff8a2b3016ec651c19608f65ed34b6c3fdc": 26.684069672939344, "d25a4e962cfe0375b3b09337d061c88523f403356696acf75fc8cb33315bd724": 72.79066007518534, "ad14a687cc0adfed6d156fe3c7d5001df8a23a073aa2a5c897f629d71f5d84bc": 1.0, "d08198f49d8dfaa250c819049e17352f0f1c096bbb8e3e617db2a8093c4ccf77": 55.89888344335565, "c2ad0d620983cb631d67e693acdbda4463b4d3f9d126f32612697b35f7d5e3f0": 26.987548949231382, "b7d25e2aca05d0b9d1b1f8da189b774f424d33fb14c939892ea650204ecf5102": 26.60852954074133, "ce88f9a0a67a597a3e91d6e4019b8ea5abcb77dfb53d34bdd28b8dc7182ee4b2": 46.33844624487648, "b7834a8ba3a2960e9e827c788e221978050f46eec1cafddb8321cf03fc35ab8e": 91.18612915503554, "b5e9b360d0fbca63088e453579f158090a7c561b1b3ee2e8dda47f313565f3db": 39.49274443154211, 
"aa4dbb7973db299f13f47d2cba40e668f82aba917e169227f0f3910f97ae6f3c": 63.824322321310746, "a00a906cd9c3f07d93ddba8615a28e76fd1df014fa60bbe401fba51fbaafbc52": 40.832419610603544, "87e8d23a1149e93867e66ca8ceb7be256ad8f9f28489929c9c20493e082778b1": 55.48284372477995, "9813a55613e7a6820e0883cc5389b6dda169ff316457d9ab942d681afdee06ea": 36.15403453829987, "8e9ec8039bda37c517f6e9a9b9503e82adc3079a62275066b3be0bd55df803d2": 39.625756920232604, "846c52134cbbcac08999e6aeee300b0f8d7e570abb2a02ede3fd0fbf07e1b2ce": 58.379177644160855, "769e4692c5a15eec3bb6c59880929d23c4baf34701e7579ddbacf9f4076e1862": 38.552950262546574, "a5491097f9f4cfb78aaaa104cbd24f5498b0910644bd9dddf4f7e9c18f83ca70": 140.85164875194508, "621a845dff373bb3c0d745599480b5ea94905cd1c9c05a5665dcdc549c976cf0": 26.706600258773634, "6acffd8f77b7132fe3c7049772ca26f6710a4619ec1305195ef9691bd036afed": 92.32701540604386, "65c969e58e4f1801a92bc5d51843cb3a8a7c59acb16e72d863a0942f9dc0e5c7": 41.79097477000071, "54e702a31e13b6402ee1d84caa304e7456f0a540ee87fabb07e166b8c2dac7e1": 53.09792999157601, "5c5965b34bf7f98739fb34d7a7a4ff4f7cfe724203d297d3a0c28dcc3df9c897": 113.74475569872874, "4cf77a462c7b163fdfe0227e3eb5f5bccb6c1951da6d7d1d773e2d7502580136": 36.34793079665684, "47963c35037b55071ad8ebf753e5d2901ffe9894fba7cb19fcdf779f42648cd2": 78.1995193509912, "418a414552d28df0c27d6ccfcf909fb61ef3708bf3c55ce0088caf17b82c8797": 67.71363832981024, "3fba344e0fbf60489a54c7ec9c35d08ca2b7dd50db264ed333b8de57a15b9e09": 63.29060134566992, "2c1a2e2229b3f41d9d1f4a3dffca0136d472695604e8a568823cdd5f4b14843a": 54.878533583767016, "2ad960fb8b7a95a148dce43bcf226c620bbaf8f84b2bbb75a03579379eb265b8": 61.38495795736299, "23ed86cd79b40655df50d89c3ee4ddf3ba25b7dcf39370b3c9f09251ebf02065": 56.24736969788516, "ccaaaaca541d63934a3693ea26d895646afc49db7d963053c37989a0cf9b8511": 43.261782641208406, "1ea4f20070d239bb1a17f38f062b3e86fafc0d72bcce48ec78a3a1f87a63b5fd": 48.52638937085679, "1bc4387138b702bdb683c49cadd29a5bff121b492573b1c622d0fdea8e1ea668": 
26.74086360182068, "44db14a71668763b8be9674a7a264033d75b075c1ff2c44f5d0ead41e3edbb3b": 140.85164875194508, "ffbec10a445066f1f2231f1c488cf1a634c3e23478e66dc9ecb65ce0e0dab0f4": 9.15242040106613, "b4fba01f563bbfddb6cbbfad21b9c226928ea2ff46bd9a3c515dda9ecc47ae35": 43.261782641208406, "fedfbe2fcc78d20223fa469e867cc7cbfc9188923dad60189f6a55270f5f4f0b": 10.365470610485469, "18bb009557eeb7d543b8f3dd904b48849eb11c381a85c8ba5ee1145b270e47d3": 26.74086360182068, "fbf6b1ba422281c774def4fa1b428576a32368d6d28f260daac9ec344a3aa287": 29.056016318981932, "15cdf7c7a1065b1fcc8c2381c2a7b4e6099c3d3e0fff923ae99f195c0b7c6d68": 44.56801125895459, "1a77bf9a4252b2139472d24707179920a06a217e917ee1d7db590fa7c421f4c1": 22.989070311388925, "f71d4e579abe9f98f966dc719ac841bbf1d4787cc451b53d12c1e09bda0fef34": 21.127343954401123, "fc4667ccda7d028fe4c0d4fd6acb2392e7adfbb9e26f4d5841f5d6721bbc4a6c": 45.15486452418983, "f945cd456c747c491350cc745c8b6c71c827c121a62b6de6a6b20b450b2ad9af": 29.056016318981932, "f62e9a1833f31106459515b97222cbad6284b25937fa9e125897f41b318fc14f": 9.15242040106613, "f3f34bb7ccac4688139416518b436b3a1131087279a4283c02ee84838cea8a5f": 20.774575653653834, "f99f3f76939d72387b8829347dfd28c7ef050bd793bbdc203b3419d2c0f86ef4": 44.62679812661854, "f95dfdd4bcefddfb5d78d328e9ee4a068ce0e13d23990006a23687884075ed9c": 44.21233066616023, "f68944988333c36a689206c45c9df1640c767da20ef4e7a11da12ecf07c7812a": 29.388236241590494, "f3bc981bda8f8da90dcb852f112703c5ef77e01d134c47a6ec6d1400635783a9": 22.75456893072957, "f02b46af84a38f341c139001dee4faca97b77d4f7bd239512d40fa8544ae6c31": 20.774575653653834, "f1df17c494f37fe7b5695fbde057da72ed51bd65d32358a6ed56a7be59c5ab4f": 23.085228542683236, "0a41b773c20d3ce4790914ae86f92329709a6c90996253764d8be99bb53233db": 108.59554138437265, "0f5d84acac73776367ff855c54e889341ee031f505315139b8259e3cd8c0c589": 76.29477617867975, "eb219c76a74c5460513d48c006e80ffafe3cc7958cc3788b0562c7ac5642d4c4": 23.085228542683236, 
"e6368abbfddcc650dfce596c3fdfc566499e9938566894913870f1fc0b29a022": 9.15242040106613, "f3c569118d18b428e51b9ba147d060f805427264c928777d577e895030b62a35": 42.841604618072324, "dfbbc47725fca41ea22f5ba841aeb0f78824a3f6b1bfcdacf29a0ec7fe290c22": 21.127343954401123, "f377e512ba1de455ef5fa4edc8a458d68278b11ae9754414c830e9c621fce4b5": 44.678579276478565, "f3b8934032c1a70b1ddf24da6b52e176674cb5a88d623296917682397a65f755": 44.874993226010446, "e1381f7c966c279c42001bfecc77cbad8b73031bd8023f663e5ab98022f416e5": 23.085228542683236, "ecba3f2ac2d44ae6be2572058dc332d0a58fbab28f790ea3022fdde6df2c2c71": 48.123678365161254, "ded4ff7c39cb38295028e96c9ba1c2766cec32c37d4a686f495670c2d451eb97": 20.774575653653834, "d7ce6b394b2108e016518db7d2f03b1a59af3d70d52584ccce60bfe986029c98": 9.15242040106613, "dc4ae0d00fcd90830b29c96f33b348f4c46dabde4cbddcf1c7bd317be8535477": 20.774575653653834, "d8c697867f7713af0d5d9a8519c3261be9de8868da419cc323e140dfcfb68d18": 21.127343954401123, "ec5044601f284fc52e174a2ad232da8eab0018335f9afa36cf89262fc89e34f8": 44.93469844739138, "d430d40a3bc47aa01ea1cc92d832bf819810b8907dd0d29c272635f8dcca03ae": 20.774575653653834, "d050638603ffae293321f8d022601aa793a98a0abbf9b6bed61f8f47f76bce7c": 9.906454855893644, "cb6b6bc1d27773a3ebf5f058e5a1d99bd1d189595b6e7415ba914858514e5d1a": 23.085228542683236, "c16d7de204c6243e0063d679508ebadcae794ae9d42f3f7ad3719e54a03b3797": 22.75456893072957, "c5888943832c23951441c64a3aacdd549d1c9e13c6f589ee3fc75e0bec69ead1": 22.75456893072957, "c6d2fe56791e6929cfaabfc79868aee971bcbb9fb8a709844a90714bd264d95e": 22.75456893072957, "c5ae015083e8380a006c06a9b5bdff012ce680cee69112fd955f542447f43ab7": 28.722142684233496, "c43dfdddeb9a312abd32cf23d9e387983a1cb77ef6fa9b37c495d93ccb9da2f3": 30.021887081079377, "bfe0463189df56931dab4c4e2f728c6ab1eefb3b59b66b2d86850372498e9da0": 22.75456893072957, "c1084cd279e230a998c1faefe270f1a5ac64fa14004fed0153a3bd98ae1ccd91": 29.056016318981932, "d1c0d73623556be64e29e290291846f77a94a161908ee1ab84861dab76568aa6": 
43.12160181642201, "b983f462c4380ef55e4af158c1f69e30f3ac76748a5fe8d81f3d0005bc2aa557": 20.774575653653834, "beedfd85d4a78ec65704931862de83e40345f561366adae024e74da1c5720994": 29.056016318981932, "baaf10d91d2de4d3eb6fddf9b5e6afcb3749b25fc4fd9ef28cccd286adbc2c3b": 29.056016318981932, "b7b44ee4f343740e699ad8ea92cf3c4f2980524eac7100608995b8852fab7f7b": 21.127343954401123, "bdbe431708ebe15e80593b4d16a0835461c95fe198955835ae9228fde0e10809": 29.352961594554294, "b9f5422ed75fda0c53e942db6c400c6918cb2ff8d083cf31faa902d91d4a17b3": 29.388236241590494, "b22f78611b57600c11b74f3ac20530a3daf3bb56538463b765001ad61d42f575": 9.644136059253206, "b412a5f75935b1af9f1bfab1f113e5100904e5c1a110853b1d61f72ea2296a34": 21.127343954401123, "bfb7a4b6bf20845eae1e7bb19cc8516f13dc9405e62de6a6b644ec94dbef8bf0": 44.21233066616023, "ccc27a19dcf756da69b3c62859955e8d9a99634f0ae199c3da040437bea85b7a": 49.768293345860876, "b067696be68193ab9777fa70fd874b7dcd0fad15e432a0eefc96d9f17e4df9be": 9.15242040106613, "b03f45c41b29aa2464d3b748aeca08c47737ebdc183e139f504779b69099c4ef": 20.774575653653834, "b6f9c03f15b279f734f2dd1c7fb152cce9de4c2beba7cf5ec48335c9e99aa7b0": 29.692243007352676, "abb41bb78a1a7737e5b26b3cbec4d292cbf6837a7aa3bb3995f28591f2f7523d": 20.774575653653834, "9e83cdac5e5975c876a23105679d16dd4c2ddc0b739f5f12bb64303fb7ca3cc4": 9.15242040106613, "9e537babb3c4afcdfc72e675a6963563af7986573a9d1ee58b9061cbf3c7e8bf": 8.25710150309047, "a2e9e0995a3fbf23872205011cc533c6bec3db0c5d2c5ad15ac4d2180ab9f0fe": 23.085228542683236, "ac701c911bc07125a37ede74b5a8723411db301bfdadc943fb5f04906a1c6cb0": 29.056016318981932, "9b94dd1998b93f92e740fa007aaa58de766bc679a3c82ab88913084db0352869": 9.029279288776788, "a1f7b0a6017bf6bbcc6887098f955b033b1df613f6919bf8183eabf6b66d71fd": 29.056016318981932, "b6471e1deea086248901f8fb6e61de6f7bf9a0d61cd54ce957710e431d5645bd": 72.0002012042772, "9941b1f791817590318f0a825eba5600b8eec5d93e06c65ed56252f8bbb181fa": 9.906454855893644, 
"9848f776069e6dceced891156e54cfbe488eeb4897a49ed03b2e1b5665b3ca03": 21.127343954401123, "aca094d9984211da29d6d518033e3f7f4dba96e958d71e93a25b71824c2ebe75": 48.453924867523526, "9546d306bc7a61023bf79122911271669c6afbfed01d2c74a17f27c4da700cde": 11.969163848589885, "975f181e942d5e18d6e6e88b1b1617f7f3e0ded78cc54ad13bcdc31cbb49a950": 9.470545819027635, "a6ba1eb51c3585dfe9eaa18ae0a6d154cb25aa0718a8e201d516eeb7275b56ca": 44.62679812661854, "9071b7387524ef8c98b826c2b8f4806a3f5355c3068c792b2d1b321bcdb483be": 9.15242040106613, "9011b2d3178b84b2914c06cfabb9423a98fadf8386e33514a183d38e2b2b7df5": 22.75456893072957, "8a2a8a9e306e0fce5d8d2afe5f3328cf818779a627321dfb081ebde47ff529ff": 22.75456893072957, "9b5fc46b9a31b80ac0309e3ac5769bc40aa6155246ffc2a622c1482b9c47db88": 44.76276136629975, "950d7cd54e835ed6d04828a642ca6cbe38504e2c3b22e4d3e0f4b221354ad0c4": 29.056016318981932, "9933755db05a2e3e6560cdab0d46486151b6de1cfaae403ae905c8e6a8e9f76e": 43.83453646739979, "82433d1c0fed966396ee2d9cde60d5999628722d5fc077008d2f41ed2160006c": 12.72444351659991, "85c237b88bf35a4e211e775a15acb00351a0cd02fa39cb83b9e575c1e2fa0240": 9.204481888766592, "8669d0b74edb8184285cbf643ca89906e1775032ffd9c86e9a47a371eb33dcda": 20.774575653653834, "9523aca563c2208da0d7d704ef79c4978184684906833e5d947587fc232c3cf9": 46.11227189718904, "805a4191564e2950ff2fada9b3b9c9e710b58aac2729fbcdbc682cb787a34e19": 20.774575653653834, "97dae97b443905bd9f915d4684debeeb92160386060e849c4d9e182c0be27ed6": 44.62679812661854, "8118b0ba07f2151dca343c2043c8c2d3112454a185e8e82f465b0b17c796b16c": 23.085228542683236, "761d44ca4a74ebc0bd1893ee5998194bcd87ad1ccfed8f6ea8de5523955f5415": 9.906454855893644, "835a0f2477fa2832b2e65b22aeb846e7f028684e5bdac06fde8da232d726fdbe": 29.056016318981932, "7ebb89a611c0041315a212ab6c54c696f8ffe54c46794b5cd5e713b662a59ba6": 20.774575653653834, "88af7ac57b39f6a277b28b5c7f70cf4f2f53707d3f2ec3524bde39de963b70c8": 44.46147483665518, "765f3ebaa8e2ffbc5edb9ff532db1634cc00b57b963e81c9071f394a34f3c401": 
23.085228542683236, "742527e62bc3d6d06dc5d9cb4d947f008d827598664c29832f4b3cb3439dcfa8": 10.365470610485469, "7bda5dd1e086f7f6510d6e063d2a79b827d9a3b76a402532b0a519c792d554fe": 29.388236241590494, "77b6330b7d5ecf877bd75aca819e5b677dc3bf43cc449c91bfb04bca48c5bb1f": 29.388236241590494, "6d5184b121b8760cebd92253de0922371eb08b21106a33ea70377039d78bcd51": 9.906454855893644, "81559873890309db2249d607b9332b126815a36e4882d6eec42fc8296cfe0947": 44.62679812661854, "71d7c3f918f3cade900f892d5f69797280469b9d3110f1c0db97d42149b39b69": 22.75456893072957, "6cfa759e2cddd69cb05104c0f0eaed0d2bf5e8120599e8622f87caad0c7d367a": 20.774575653653834, "6d45aca113050b55e27dba28146b592ce23eb855ce2ccd87631f31943257d920": 20.774575653653834, "713ecd172875eb845562ad7579db684a14f650a3f6789311f6cfcd3b303920ca": 29.056016318981932, "61ae45a5fda0c36c3b957aaea4938688769d753143ef18ac51f66129419edd9d": 20.774575653653834, "6b6d1425205640b969874e5163d413b130d9335bc2a788f5dbe69a12852d06e2": 28.3887902509193, "6c436cfe108e2618885f91bdacc2355bfdb0baf3372ad4963220adefd23ec700": 29.68400343897251, "643dd023229c412d6e84f95005d8c4737a76c032fc985e472f5008a2cd8a4937": 22.75456893072957, "68ed55ef3bb8bffedea34fe998ec422cc446777138be5381c72c112f6c4faade": 29.352961594554294, "758d5ebc666374a76bd852e79b6021516075637f57e4899225020e384fce29a4": 46.03789854557223, "674a189bb0011d59fcaeadad01474a2af1b7b738c5d22f863443394b7a1e5488": 30.7316027846515, "5b615a44f3c51c8a75a508ac2b882e4943dd1a9f6e422f0a5e78397885af056d": 9.15242040106613, "5e02daa6d81ebc6c313736c1d1d05b441d836058570439cf50581bb553c118da": 9.644136059253206, "6175ace80b929f14453096097c3191a199a03efbd97741c0a899d6c2d1157816": 23.085228542683236, "5877614c5444264deb1a571f863a0ef83b4319c5f09226523e87634e79f0a9ef": 23.085228542683236, "56de4b00c6c4076230f80fb00a5ffe027cbd7988f5fe4dbc0a911030007ce4f2": 23.085228542683236, "59a20da8e24c75a1cbc7e4a75abe4958783f49986c10b0d71aa3eaf243dceed7": 29.021800427244447, 
"5261dafdf10ba9e98791c06020116d8886a55396cc27979bf7d225607ae1efcf": 9.029279288776788, "55a2b0e93d03b2eb2257a632e0e81a4c92e8a17df6d625c34778ab2f9167b15e": 22.75456893072957, "50f2de336bc754dc7da4912a664f0bfa1f34b8b4a5f6e35181ca5110d1ea05b4": 23.085228542683236, "61a216b322fa65cad8224de76cf6ed1ecb6503975599784800f6c73eb0d7ffe0": 44.21233066616023, "5a833435b660bc801ef6ef7465326e4553c1f82c2749c000b2bedb4a00dee93f": 45.40316495769846, "60f5b74eef997b275d44352ad292ff87e0b1c3ac2c0cbdfc4c1563c6f21c9da4": 42.841604618072324, "551cfcdf5dda995909acb6d85f33ad9ee7edd017cc4ed3d5aaa0d26bc1d08046": 50.39100762668442, "4e7208386e8791260b1c16c965edb0b5160bf689ce8b345e2f1d77482b324f88": 29.68400343897251, "48e6fe32daa3b1c42d73d9878ffedfefc259d0f9e68fed7bee5beffea5132106": 22.75456893072957, "4d047ae965feba8c983810de5d09667dbce90a319d14614ec9e52dc688c44b03": 29.388236241590494, "46a9ed0acbd307b4861a2f13b4e50b6f72de3a80fb6e03285c1393963b152c1d": 9.15242040106613, "487c403b445d4f2e8523bf11f8fbe73b5a359f58dd0e6b535b1687eb2aef650c": 20.774575653653834, "44a69557daa28de0d5e0352719e398e859093154416d757134ada24f0ad02c86": 9.906454855893644, "4f3a9f7adcaafbc4db2923b01fb2e922b5a6bcdd1c6ae7ecfd9b9ff7e1861a6a": 45.29603170023727, "449b07e2e6cb8810086082eecc98c216983fd3fe0998b68f6cf080bd8daf3d22": 21.127343954401123, "42db35c21e0cdfecd633d316c56cf72b38bddddb19566deda5297f3f0dfabac2": 23.085228542683236, "40fe0e0b78d9f1bc4f233bac001e1761c409ce70d718472d86bee472b7e477c5": 21.127343954401123, "45c4d6e7249e4c9c84ffee2fabff9b2bdcd1b8a55f747c1dbf0d03cc3bb67d7f": 29.388236241590494, "48ce8482910b44a1410c0bcd08a8a6afab8922df9293cfc6deb8046e4a15a19c": 53.37008561826078, "489b3aa4b0fce763b089ed3d31817a4bb8bb3b7ad140352e7a7d80ee400219cf": 48.23112868706284, "3b9f9c26698da8877332b887c2a2a4cbd103fc908e3b89248edcef9ff59fe84e": 9.15242040106613, "3c26d74283d877a91992d7a76690f82f07955644af2d462b4b4d12c776644fe2": 22.75456893072957, "3be72d1f0133620dd0f882dd72c690b5246c523e36331a01321d1b9550a3dac5": 
21.127343954401123, "463c716a5d345b2ed355eaf2e192585b765fef97161b88eb2150ad010071be3f": 44.93469844739139, "3422df3ac6b26fc348b2729856df64c714a43c36eccbaa8ced46bc4de2680bb8": 9.15242040106613, "2ed74105f9957038b151adf1555fa40f71e77331d3773f686008a234458e80ad": 21.127343954401123, "3f04def7b7a297a8c84f849bc6caf52173f4cba3d72eccacd9a6befd95cdcbfe": 49.536311060501326, "390f67b6280045e946c0ebc061479589baa5ab8440e127254eaa2640f43f7407": 29.056016318981932, "2e07577b9c2cecaf7fe48c5cd3aba7b555f2fbe3786e2cbae33a0b325c683bdb": 21.127343954401123, "32dc32daf2f993e5bb872c0ac8ddcb2e1a8f38d4054e3c5f3f563a6298152668": 29.056016318981932, "36579fcc1021fabafffa1b1735ddbdd2972d44a75c7452c5e6d6d77b680c1efe": 29.388236241590494, "2aafa8397e29c918e9a3677949f58b3aa88366c56cc478bb38fc16f7863ffa38": 23.085228542683236, "3c36e7bfdf3912bc2e1b064a8311d0f9c019e4677a2338f26c968e540fc8d5ce": 44.74237380610771, "28da1df2c385d9a20fd5a14486161c8f91197db46e7a0b5e06da27c702fe3609": 20.774575653653834, "27f1c2045db17ed7a6703ac1f6f7dc495600662cb9d1f42ded79c3a30884387d": 9.15242040106613, "1df8b6015957088149a69add4821cb1cb89b7afc298cbde9aa3a7c9494aba916": 9.906454855893644, "1efe3d9ffdf7a21372ed76eaf6e4ab91f70297c1b5dbc77a580cbfdf7ed36b28": 9.15242040106613, "1c92e2ef53a1c25d958a1f4c4a4a787b38317828c72566b6cc47d75ce35026cf": 9.906454855893644, "1fb88c51e7f9cad3d5f47711be2332e01813efa85c168fa1a1c6fcdc19c5bc7a": 21.127343954401123, "1c2c86321c61dcd113bdce32a8a6c283e094807796d6a34837e715b5c482607e": 9.906454855893644, "210ba732fcab026e563d70a57e8190e4bcb83f21175382cb0b71acacd72f0b20": 29.05631744215075, "18129c4d0f661bed1187539469caea080a8847dc7c823c91c3dd24a4c2ce9a4a": 10.89081012759314, "19515c3fcfc63b784155f80c980de498f38cbc9a7eb2b7286b8aedfa737580aa": 29.388236241590494, "2a7882316861e3722d4e0af6664f2bc19bf21c6ba689d3ea19cce41200e66cdd": 44.79670891410795, "1697d55a11db1c29e4082b46ef3c00a8f5f98c0a670bddafae2878dc788de74a": 21.127343954401123, 
"18b890bbfac268388ac177bc4386ee6fab48d42896b7446b151bb9c4d146e300": 28.722142684233496, "18bc616b770025f138899d894df98452f494ff8f3a25c7181e2be4ae79cc8490": 29.056016318981932, "1cc2673e9e5976615df163a530a3f26077af31d351087c2fd66e7e2d513db8c6": 41.75888493125048, "1325944640bb31c4922561732452b232f31ad29382192f9e1f8e57216823a87f": 21.127343954401123, "1162ec35aff4085988800452e535e264fa0b8ac98a002d7a444487e0fb6d6bfd": 20.774575653653834, "18c64fd22522a3b6c042272f7d83c09f4c20df941c87dfe44d5358b00f0857a6": 44.46147483665518, "1535b2728e099ba0b4f37f14a019c5b0ec9536a58dcca4a3e919736706ba7003": 29.353402315484487, "08447fa1611f87ce667e9046f4819d5d31bb857db27b5154b39c87567bd21f29": 20.774575653653834, "0fb1432d17536b084e15fd25f83f7bf54e9aef40fbf3de78bbb6355ba7de3bee": 22.75456893072957, "19a6c30c3b3632a4fcadeb59d3601e250e4ae3fb66ae731ffdb2d40a4da2a074": 43.3540903389824, "0bb95f43d56f4f44b65c22cb69a0f9a07bceeeb8c2327f8c38022591a4a2345e": 22.75456893072957, "062fd29778b6343b039019b4713b43054e3e25d548d854941aae746499bbbe4a": 9.933157995605066, "0765b3c4cff566f76d671eb6853e287a3c12bb248a541adae33b380c7ec73fe8": 21.127343954401123, "001eb0af73e9996c32162df3a5907f9023ffd32791ced8862081cb47d8821a80": 10.198764329444513, "03978af66d5486871b95176ed09e949101a87fcf7f410d6f577407747e4b156e": 21.127343954401123, "05fb7b1fa85f28cb7a0d0d5eff2a59a3ece90f87921dfb2066cce699a27f9b4b": 22.75456893072957, "ff977df22427ada6aacee0f094bd85fda4ae7006fc409960711ce9959da9ebf3": 9.964349716616553, "9db6be05412c831c65599407f80f92ee2d45ed4d1ccadfbbf3f517cdee51c74f": 42.2369818937523, "fa02a29da1574e7181887efea3546a9d88d347e27c95484e83773023a9bf97c2": 9.964349716616553, "11d83d3e071c33ec09db6c127bca8ac0cd6133741b8f174436117ffdb7d5de5e": 43.22680596296265, "0cad2890bb72f7499c71368a876fe79167ec651b599678e98ffd51e260e91885": 46.970461961641384, "ef1e6028b6495f0638119b8e724352149ec23740428cabc2962d2f9e26fb8583": 29.897091741342642, "6fb75b03a41789f031141af612f630756f6e4de8810840c79aaab2c027a4d845": 
9.964349716616553, "7c65410d769444ced5f1779f790b0820b129e7f288f2dd8d2e577b7c1d3b955a": 9.964349716616553, "9318883a17841e7d6536d487b5a11a0f68e54b5268feb61d567b8a6ba2993ff8": 9.964349716616553, "fcb715d440b2e4d2b45afcae74a8732dbede2fb8bbaf1b25f557663ba8556d74": 62.632102766603474, "ea5d93c758c4bcd3b3065fed56806bb9eedee2a8eb0a3bd3137198ac0b437e5c": 27.856283288789843, "6d0828ad3dfb8ba58be60b61adb2b9ae55cd9f5ec11a58d992bbc2377ad4d42e": 9.964349716616553, "507f837455f458658fb55ab92da6880522915d2fbb85451dc01671d71353c486": 20.785623810827374, "5843e47179d20d8855ec4e79965156bd0788c22ce892521948fe99e2a91464b9": 21.291598588247307, "4ff8a28d7a42d75b8a229d2a615313ac16eb5badd912389089651a4c17e3aeaa": 25.869490053076245, "6f7f09c4543caee27e2a5e1906d11ec0fa3a0f4f6e50e4893b37db0f4489f6c4": 34.05238974226952, "10103e9010b2d439e9aa8e50aec6a4cfd457997850355dbb76099bcff2797412": 9.964349716616553, "2bbdd54dfb563668c2cf56116b3084c9c5256db0ba27c8e1096674bfdf4f40e2": 52.74513259989021, "0a367a3b8605934ebc14947def30eb32fe4bbc9bdd3dc4654efb1c73c522920f": 63.2393021745371, "27601e6a4b6e90bb9c919b6a6e2ca0275d85bb1b9c6234080d06d1f2ccd59a5b": 33.751569371306566, "06e627be48c0356f8c1414dd1e44f7d1e788ff788f2a09f3f32b4a019bba0109": 9.964349716616553, "e3554fad67505553a9742d31c4b526701f9f4a8a0a0eedfd9dafee6af43d59fc": 29.921025404224675, "f19ede361d73dbc3cc02cb8c66b285552ca72681c7f6d21aeaacf77490a78b0a": 116.30247780083816, "c6e615a6a72d7518bd77857d8651f2c04b35207d9039ebf3cc1509ea76202013": 56.5285490558419, "35f503672017253c81058a83bc9ad0adfe3aa6f3933a4f232a93b6a5cdfd28cf": 59.241376223820275, "0f274aaa945c05641a9677b951c32026bb201ec9aeb6e691fedd1235b3a5d6af": 30.886771469122486, "7a23de605f63cf4d997c135aec35ffca3ddef439c383a7cab9b149e829d4e3f6": 56.16082992532878, "2b8802db76f30d1ea6cfc23048d6513e8a23dbf307976fff915062100762d99f": 56.5285490558419, "192286a9954a2917a50ad6d5bb1efa61e2de5e94c7e9763d0d3c6e985677c6a5": 42.39957890627398, 
"906c24a2203dd5d6cce210c733c48b336ef58293212218808cf8fb88edcecc3b": 65.38624892493604, "8ef6d99d9f9264fc84514cdd2e680d35843785310331e1db4bbd06dd2b8eda9b": 65.38624892493604, "9a59f63e6facdc3e5fe5aa105c603b545d4145769a107b4dc388312a85cf76d5": 65.38624892493604} \ No newline at end of file diff --git a/indexer.py b/indexer.py index 9369c4b..c833892 100644 --- a/indexer.py +++ b/indexer.py @@ -10,7 +10,6 @@ #Posting ---> Source of file, tf-idf score. #for now we will only use these two, as we get more complex posting will be change accordingly #Data input -import math import json import os import shelve @@ -18,7 +17,8 @@ from bs4 import BeautifulSoup from time import perf_counter import time import threading -import pickle +from threading import Lock +import math #Data process @@ -34,235 +34,196 @@ import re from posting import Posting from worker import Worker +class Node(): + index_value = '' + postings = list() + +class Index(): + length = 0 + index = list() class Indexer(): - def __init__(self,restart,trimming): + def __init__(self,list_partials,weight,data_paths,worker_factory=Worker): #Config stuffs - self.path = "D:/Visual Studio Workspace/CS121/assignment3/data/DEV/" - self.restart = restart - self.trimming = trimming + self.path = "data/DEV" + self.num_doc = 0 + self.list_partials = list_partials + self.weight = weight + self.data_paths = data_paths self.stemmer = PorterStemmer() - self.id = list() - # list that contains the denominator for normalization before taking the square root of it. 
square root will be taken during query time - self.normalize = list() + self.data_paths_lock = Lock() + self.list_partials_lock = Lock() - #Shelves for index - #https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html - #https://www.irishtimes.com/news/science/how-many-numbers-begin-with-a-1-more-than-30-per-cent-1.4162466 - #According to this will be how we split things - #Save #1 = ABCD + (1) ~ 18.3% of words - #Save #2 = EFGHIJK + (2-3)~ 27.1% of words - #Save #3 = LMNOPQ + (4-7) ~ 25.4% of words - #Save #4 = RSTUVWXYZ + (8-9)~ 29.2% of words - #Save #5 = Special characters - if os.path.exists("save_1.shelve") and restart: - os.remove("save_1.shelve") - if os.path.exists("save_2.shelve") and restart: - os.remove("save_2.shelve") - if os.path.exists("save_3.shelve") and restart: - os.remove("save_3.shelve") - if os.path.exists("save_4.shelve") and restart: - os.remove("save_4.shelve") - if os.path.exists("save_5.shelve") and restart: - os.remove("save_5.shelve") + self.workers = list() + self.worker_factory = worker_factory - self.save_1 = shelve.open("save_1.shelve") - self.save_1_lock = threading.Lock() - self.save_2 = shelve.open("save_2.shelve") - self.save_2_lock = threading.Lock() - self.save_3 = shelve.open("save_3.shelve") - self.save_3_lock = threading.Lock() - self.save_4 = shelve.open("save_4.shelve") - self.save_4_lock = threading.Lock() - self.save_5 = shelve.open("save_5.shelve") - self.save_5_lock = threading.Lock() + def start_async(self): + self.workers = [ + self.worker_factory(worker_id,self) + for worker_id in range(8)] + for worker in self.workers: + worker.start() - print(len(list(self.save_1.keys()))) - print(len(list(self.save_2.keys()))) - print(len(list(self.save_3.keys()))) - print(len(list(self.save_4.keys()))) - print(len(list(self.save_5.keys()))) + def start(self): + self.start_async() + self.join() - def get_url_id(self, url): - return self.id.index(url) - - def save_index(self,word,posting): - cur_save = 
self.get_save_file(word) - lock = self.get_save_lock(word) - lock.acquire() - shelve_list = list() - try: - shelve_list = cur_save[word] - shelve_list.append(posting) - tic = perf_counter() - # Sort by url id to help with query search - shelve_list.sort(key=lambda x: x.url) - # shelve_list.sort(key=lambda x: x.tf_idf, reverse = True) - toc = perf_counter() - if toc - tic > 1 : - print("Took " + str(toc - tic) + "seconds to sort shelve list !") - cur_save.sync() - lock.release() - except: - shelve_list.append(posting) - cur_save[word] = shelve_list - cur_save.sync() - lock.release() - - def get_save_file(self,word): - #return the correct save depending on the starting letter of word - word_lower = word.lower() - - if re.match(r"^[a-d0-1].*",word_lower): - return self.save_1 - elif re.match(r"^[e-k2-3].*",word_lower): - return self.save_2 - elif re.match(r"^[l-q4-7].*",word_lower): - return self.save_3 - elif re.match(r"^[r-z8-9].*",word_lower): - return self.save_4 - else: - print(word) - print("You have somehow went beyond the magic") - return self.save_5 - - def get_save_lock(self,word): - word_lower = word.lower() - if re.match(r"^[a-d0-1].*",word_lower): - return self.save_1_lock - elif re.match(r"^[e-k2-3].*",word_lower): - return self.save_2_lock - elif re.match(r"^[l-q4-7].*",word_lower): - return self.save_3_lock - elif re.match(r"^[r-z8-9].*",word_lower): - return self.save_4_lock - else: - print(word) - print("You have somehow went beyond the magic") - return self.save_5_lock.acquire() - # I have a test file (mytest.py) with pandas but couldn't figure out how to grab just a single cell. 
- # so I came up with this, if anyone knows how to get a single cell and can explain it to - # me I would love to know, as I think that method might be quicker, maybe, idk it like - # 4am - # https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen - - # removed parameter "word" since it wasn't used - # TODO: Add important words scaling - def get_tf_idf(self, words): - # words = [whole text] one element list - # return the score - try: - tfidf = TfidfVectorizer(ngram_range=(1,1)) # ngram_range is range of n-values for different n-grams to be extracted (1,3) gets unigrams, bigrams, trigrams - tfidf_matrix = tfidf.fit_transform(words) # fit trains the model, transform creates matrix - df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) # store value of matrix to associated word/n-gram - #return(df.iloc[0][''.join(word)]) #used for finding single word in dataset - data = df.to_dict() # transform dataframe to dict *could be expensive the larger the data gets, tested on ~1000 word doc and took 0.002 secs to run - return data # returns the dict of words/n-grams with tf-idf - #print(df) # debugging - except: - print("Error in tf_idf!") - return -1 - - def tf(self, text, url): - # tf - tokens = {} - split = text.split(" ") - # loop using index to keep track of position - for i in range(len(split)): - if split[i] not in tokens: - tokens[split[i]] = Posting(self.get_url_id(url), 1, i) - else: - tokens[split[i]].rtf += 1 - tokens[split[i]].tf = (1 + math.log(tokens[split[i]].rtf)) - tokens[split[i]].positions.append(i) - return tokens - - # Does the idf part of the tfidf - def tfidf(self, current_save): - for token, postings in current_save.items(): - for p in postings: - p.tfidf = p.tf * math.log(len(self.id)/len(postings)) - self.normalize[p.url] += p.tfidf**2 + def join(self): + for worker in self.workers: + worker.join() - def get_data(self): + def get_postings(self,index): + 
merged_index_index = open("merged_index.index" ,'r') + merged_index = open("merged_index.full",'r') + merged_index_index.seek(0,0) + json_value = merged_index_index.readline() + data = json.loads(json_value) + index_index = dict(data['index']) + to_seek = index_index[index] + merged_index.seek(to_seek,0) + json_value = merged_index.readline() + data = json.loads(json_value) + return data['postings'] - num_threads = 8 - threads = list() + def set_weight(self): + weight_file = open('docs.weight','w') + jsonStr =json.dumps(self.weight, default=lambda o: o.__dict__,sort_keys=False) + weight_file.write(jsonStr) + weight_file.close() + def get_weight(self,doc_id): + weight = open('docs.weight','r') + weight.seek(0,0) + json_value = weight.readline() + data = json.loads(json_value) + return data[doc_id] + + def get_data_path(self): for directory in os.listdir(self.path): for file in os.listdir(self.path + "/" + directory + "/"): - #Actual files here - #JSON["url"] = url of crawled page, ignore fragments - #JSON["content"] = actual HTML - #JSON["encoding"] = ENCODING - index = 0 - while True: - file_path = self.path + "" + directory + "/"+file - # Add url to id here so that there isn't any problems when worker is multi-threaded + self.data_paths.append("data/DEV/" + directory + "/"+file) + self.num_doc = len(self.data_paths) - tic = perf_counter() - load = open(file_path) - data = json.load(load) - if data["url"] not in self.id: - self.id.append(data["url"]) - toc = perf_counter() - print("Took " + str(toc - tic) + " seconds to save url to self.id") - - if len(threads) < num_threads: - thread = Worker(self,file_path) - threads.append(thread) - thread.start() - break - else: - if not threads[index].is_alive(): - threads[index] = Worker(self,file_path) - threads[index].start() - break - else: - index = index + 1 - if(index >= num_threads): - index = 0 - time.sleep(.1) - # Make a list the size of the corpus to keep track of document scores - self.normalize = [0] * 
len(self.id) + def get_next_file(self): + self.data_paths_lock.acquire() + try: + holder = self.data_paths.pop() + self.data_paths_lock.release() + return holder + except IndexError: + self.data_paths_lock.release() + return None + + def add_partial_index(self,partial_index): + self.list_partials_lock.acquire() + self.list_partials.append(partial_index) + self.list_partials_lock.release() - # These last few function calls calculates idf and finalizes tf-idf weighting for each index - self.tfidf(self.save_1) - self.tfidf(self.save_2) - self.tfidf(self.save_3) - self.tfidf(self.save_4) - self.tfidf(self.save_5) - - # Creates a pickle file that is a list of urls where the index of the url is the id that the posting refers to. - p = os.path.dirname(os.path.abspath(__file__)) - my_filename = os.path.join(p, "urlID.pkl") - if os.path.exists(my_filename): - os.remove(my_filename) - # Creates file and closes it - f = open(my_filename, "wb") - pickle.dump(self.id, f) - f.close() - - # Creates a pickle file that will contain the denominator (before the square root) for normalizing wt - p = os.path.dirname(os.path.abspath(__file__)) - my_filename = os.path.join(p, "normalize.pkl") - if os.path.exists(my_filename): - os.remove(my_filename) - # Creates file and closes it - f = open(my_filename, "wb") - pickle.dump(self.normalize, f) - f.close() #Found 55770 documents # + #getting important tokens - #getting important tokens + def merge(self): + partial_files = list() + partial_index_files = list() + parital_index_indices = list() + + num_indices = len(self.list_partials) + + #Full Index.Index and Length + full_index = Index() + full_index.index = list() + full_index.length = 0 + + for partial_index in self.list_partials: + file = open("temp/" + partial_index+'.partial','r') + partial_files.append(file) + index = open("temp/" + partial_index+'.index','r') + partial_index_files.append(index) + + for partial_index_file in partial_index_files: + partial_index_file.seek(0,0) + 
parital_index_indices.append(json.loads(partial_index_file.readline())) + + #Start all indexes at 0 + for partial_file in partial_files: + partial_file.seek(0,0) + + pointers = [0]*num_indices + merged_index = open("merged_index.full",'w') + merged_index_index = open("merged_index.index" ,'w') + + while(True): + + #Get all values from all indices to find min + value = None + values = list() + for i in range(num_indices): + if pointers[i] < parital_index_indices[i]['length']: + values.append(parital_index_indices[i]['index'][pointers[i]][0]) - + if(len(values) == 0): + break + value = min(values) + + #Get data from the min value of all indices if exists then save to mergedIndex + if value == None: + print("I have crashed some how by not getting min value") + break + + node = Node() + node.index_value = value + for i in range(num_indices): + if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value: + to_seek = parital_index_indices[i]['index'][pointers[i]][1] + partial_files[i].seek(to_seek,0) + json_value = partial_files[i].readline() + temp_node = json.loads(json_value) + node.postings = node.postings + temp_node['postings'] + pointers[i] = pointers[i] + 1 + #Change postings here with tf*idf idf = log (n/df(t)) + node.postings.sort(key=lambda y:y['doc_id']) + for posting in node.postings: + posting['tf_idf'] = posting['tf_raw']*math.log(self.num_doc/len(node.postings)) + full_index.index.append((value,merged_index.tell())) + full_index.length = full_index.length + 1 + jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False) + merged_index.write(jsonStr + '\n') + + full_index.index.sort(key=lambda y:y[0]) + jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False) + merged_index_index.write(jsonStr) + + for partial_index in self.list_partials: + os.remove("temp/" + partial_index+'.partial') + os.remove("temp/" + partial_index+'.index') + + merged_index_index.close() + 
merged_index.close() + + def main(): - indexer = Indexer(True,0) - indexer.get_data() + indexer = Indexer(list(),dict(),list()) + indexer.get_data_path() + print("We have " + str(len(indexer.data_paths)) + " documents to go through !" ) + indexer.start() + indexer.merge() + print("Finished merging into 1 big happy family") + indexer.set_weight() + + tic = time.perf_counter() + indexer.get_postings('artifici') + toc = time.perf_counter() + print(f"Took {toc - tic:0.4f} seconds to get postings for artifici") + tic = time.perf_counter() + indexer.get_weight('00ba3af6a00b7cfb4928e5d234342c5dc46b4e31714d4a8f315a2dd4d8e49860') + print(f"Took {toc - tic:0.4f} seconds to get weight for some random page ") + toc = time.perf_counter() + + + if __name__ == "__main__": main() \ No newline at end of file diff --git a/mytest.py b/mytest.py index 3ec2c2e..b9c7d41 100644 --- a/mytest.py +++ b/mytest.py @@ -4,6 +4,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer import pandas as pd import numpy as np + #tf_idf #words = whole text #word the word we finding the score for @@ -19,13 +20,12 @@ words = ['this is the first document ' doc1 = ["I can't fucking take it any more. Among Us has singlehandedly ruined my life. The other day my teacher was teaching us Greek Mythology and he mentioned a pegasus and I immediately thought 'Pegasus? more like Mega Sus!!!!' and I've never wanted to kms more. I can't look at a vent without breaking down and fucking crying. I can't eat pasta without thinking 'IMPASTA??? THATS PRETTY SUS!!!!' Skit 4 by Kanye West. The lyrics ruined me. A Mongoose, or the 25th island of greece. The scientific name for pig. I can't fucking take it anymore. Please fucking end my suffering."] doc2 = ["Anyways, um... I bought a whole bunch of shungite rocks, do you know what shungite is? Anybody know what shungite is? No, not Suge Knight, I think he's locked up in prison. I'm talkin' shungite. 
Anyways, it's a two billion year-old like, rock stone that protects against frequencies and unwanted frequencies that may be traveling in the air. That's my story, I bought a whole bunch of stuff. Put 'em around the la casa. Little pyramids, stuff like that."] word = 'life' - try: - tfidf = TfidfVectorizer() - tfidf_matrix = tfidf.fit_transform(doc1) + tfidf = TfidfVectorizer(ngram_range=(3,3)) # ngram_range is range of n-values for different n-grams to be extracted (1,3) gets unigrams, bigrams, trigrams + tfidf_matrix = tfidf.fit_transform(words) df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) - print(df.iloc[0][''.join(word)]) - #print(df) + #print(df.iloc[0][''.join(word)]) + data = df.to_dict() except KeyError: # word does not exist print(-1) diff --git a/posting.py b/posting.py index cffc0ec..898a5c2 100644 --- a/posting.py +++ b/posting.py @@ -1,9 +1,17 @@ #Posting class for indexer, will probably be more complex as we keep adding crap to it class Posting(): - def __init__(self, url, rtf, position): + def __init__(self,doc_id,url,tf_raw,tf_idf,positionals): + self.doc_id = doc_id self.url = url - self.rtf = rtf - self.tf = 0 - self.tfidf = 0 - self.positions = [position] \ No newline at end of file + self.tf_raw = tf_raw + self.tf_idf = tf_idf + self.positionals = positionals + def __repr__(self): + return "Doc_id:" + str(self.doc_id) + " URL:" + self.url + " tf_raw:" + str(self.tf_raw) + " tf_idf:" + str(self.tf_idf) + " positionals:" + str(self.positionals) + def __str__(self): + return "Doc_id:" + str(self.doc_id) + " URL:" + self.url + " tf_raw:" + str(self.tf_raw) + " tf_idf:" + str(self.tf_idf) + " positionals:" + str(self.positionals) + + def comparator(self): + #Some custom comparator for sorting postings later + pass \ No newline at end of file diff --git a/save_1.shelve.bak b/save_1.shelve.bak deleted file mode 100644 index e69de29..0000000 diff --git a/save_1.shelve.dat b/save_1.shelve.dat deleted file mode 100644 
index e69de29..0000000 diff --git a/save_1.shelve.dir b/save_1.shelve.dir deleted file mode 100644 index e69de29..0000000 diff --git a/save_2.shelve.bak b/save_2.shelve.bak deleted file mode 100644 index e69de29..0000000 diff --git a/save_2.shelve.dat b/save_2.shelve.dat deleted file mode 100644 index e69de29..0000000 diff --git a/save_2.shelve.dir b/save_2.shelve.dir deleted file mode 100644 index e69de29..0000000 diff --git a/save_3.shelve.bak b/save_3.shelve.bak deleted file mode 100644 index e69de29..0000000 diff --git a/save_3.shelve.dat b/save_3.shelve.dat deleted file mode 100644 index e69de29..0000000 diff --git a/save_3.shelve.dir b/save_3.shelve.dir deleted file mode 100644 index e69de29..0000000 diff --git a/save_4.shelve.bak b/save_4.shelve.bak deleted file mode 100644 index e69de29..0000000 diff --git a/save_4.shelve.dat b/save_4.shelve.dat deleted file mode 100644 index e69de29..0000000 diff --git a/save_4.shelve.dir b/save_4.shelve.dir deleted file mode 100644 index e69de29..0000000 diff --git a/save_5.shelve.bak b/save_5.shelve.bak deleted file mode 100644 index e69de29..0000000 diff --git a/save_5.shelve.dat b/save_5.shelve.dat deleted file mode 100644 index e69de29..0000000 diff --git a/save_5.shelve.dir b/save_5.shelve.dir deleted file mode 100644 index e69de29..0000000 diff --git a/stemmer.py b/stemmer.py deleted file mode 100644 index f270888..0000000 --- a/stemmer.py +++ /dev/null @@ -1,18 +0,0 @@ -#Multiple implementation of stemming here please -class Stemmer(): - - def __init__(self,mode, data): - #Different type of stemmer = different modes - self.mode = mode - self.data = data - - def stem(self): - #Do stuff here - if(self.mode == 0): - #Do stemmer 1 - return #stemmed data - #.... 
- - def #name of stemmer 1 - - def #name of stemmer 2 \ No newline at end of file diff --git a/tempCodeRunnerFile.py b/tempCodeRunnerFile.py deleted file mode 100644 index 02987ef..0000000 --- a/tempCodeRunnerFile.py +++ /dev/null @@ -1,2 +0,0 @@ - - for postings in l_posting: \ No newline at end of file diff --git a/test.py b/test.py index 754903b..d184b9a 100644 --- a/test.py +++ b/test.py @@ -1,17 +1,13 @@ -import re +from threading import Thread +import json import os +import shelve +import sys +from bs4 import BeautifulSoup +from time import perf_counter +from nltk.stem import PorterStemmer +import nltk +import time +from posting import Posting -for i in range(99): - word_lower = chr(i % 26 + 97) + chr(i % 26 + 97 + 1) - print(word_lower) - if re.match(r"^[a-d1-1].*",word_lower): - print("SAVE 1") - elif re.match(r"^[e-k2-3].*",word_lower): - print("SAVE 2") - elif re.match(r"^[l-q4-7].*",word_lower): - print("SAVE 3") - elif re.match(r"^[r-z8-9].*",word_lower): - print("SAVE 4") - -path = "data/DEV/" -print(os.listdir(path)) \ No newline at end of file +import re diff --git a/test1.py b/test1.py deleted file mode 100644 index 85c4eb5..0000000 --- a/test1.py +++ /dev/null @@ -1,28 +0,0 @@ -import json -import os -import shelve -from bs4 import BeautifulSoup -from time import perf_counter -import time -import threading -import pickle - - -#Data process -from nltk.tokenize import word_tokenize -from nltk.stem import PorterStemmer -from sklearn.feature_extraction.text import TfidfVectorizer -import pandas as pd -import numpy as np -from porter2stemmer import Porter2Stemmer - -import re - -save_1 = shelve.open("save_1.shelve") -save_2 = shelve.open("save_2.shelve") -save_3 = shelve.open("save_3.shelve") -save_4 = shelve.open("save_4.shelve") -save_5 = shelve.open("save_5.shelve") - -key = list(save_1.keys()) -print(key) \ No newline at end of file diff --git a/test_merge.py b/test_merge.py new file mode 100644 index 0000000..14b9dd1 --- /dev/null +++ 
b/test_merge.py @@ -0,0 +1,116 @@ +import json +from posting import Posting +import math +import sys +import random +from nltk.corpus import words +random_list = [1,2,3,4,5,6,7,8,9,10] + + +test_data = words.words() +random.shuffle(test_data) + + +def random_posting(id): + return Posting(id,random.choice(random_list),random.choice(random_list),[random.choice(random_list),random.choice(random_list),random.choice(random_list),random.choice(random_list), + random.choice(random_list),random.choice(random_list),random.choice(random_list),random.choice(random_list)]) + +class Node(): + index_value = 'Something' + postings = list() + +class Index(): + length = 0 + index = list() + +def random_partial_index(name): + part_index = Index() + part_index.index = list() + part_index.length = 0 + with open(name +'.partial', 'w') as f: + for i in range(1000): + + node1 = Node() + node1.index_value = random.choice(test_data).lower() + node1.postings = list() + for i in range(10): + node1.postings.append(random_posting(i)) + + jsonStr = json.dumps(node1, default=lambda o: o.__dict__,sort_keys=False) + + part_index.index.append((node1.index_value,f.tell())) + f.write(jsonStr + '\n') + part_index.length = part_index.length + 1 + + part_index.index.sort(key=lambda y:y[0]) + jsonStr =json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False) + with open(name + '.index','w') as f: + f.write(jsonStr) + +def merge(partial_indices): + partial_files = list() + partial_index_files = list() + parital_index_indices = list() + merged_index = open("merged_index.full",'w') + num_indices = len(partial_indices) + + #Full Index.Index and Length + full_index = Index() + full_index.index = list() + full_index.length = 0 + + for partial_index in partial_indices: + file = open(partial_index+'.partial','r') + partial_files.append(file) + index = open(partial_index+'.index','r') + partial_index_files.append(index) + + for partial_index_file in partial_index_files: + partial_index_file.seek(0,0) + 
parital_index_indices.append(json.loads(partial_index_file.readline())) + + #Start all indexes at 0 + for partial_file in partial_files: + partial_file.seek(0,0) + + pointers = [0]*num_indices + + while(True): + + #Get all values from all indices to find min + value = None + values = list() + for i in range(num_indices): + if pointers[i] < parital_index_indices[i]['length']: + values.append(parital_index_indices[i]['index'][pointers[i]][0]) + + if(len(values) == 0): + break + value = min(values) + + #Get data from the min value of all indices if exists then save to mergedIndex + if value == None: + print("I have crashed some how by not getting min value") + break + + node = Node() + node.index_value = value + for i in range(num_indices): + if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value: + to_seek = parital_index_indices[i]['index'][pointers[i]][1] + partial_files[i].seek(to_seek,0) + json_value = partial_files[i].readline() + temp_node = json.loads(json_value) + node.postings = node.postings + temp_node['postings'] + pointers[i] = pointers[i] + 1 + + node.postings.sort(key=lambda y:y['doc_id']) + full_index.index.append((value,merged_index.tell())) + full_index.length = full_index.length + 1 + jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False) + merged_index.write(jsonStr + '\n') + + full_index.index.sort(key=lambda y:y[0]) + jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False) + with open("merged_index.index" ,'w') as f: + f.write(jsonStr) diff --git a/urlID.pkl b/urlID.pkl deleted file mode 100644 index eff0dff..0000000 Binary files a/urlID.pkl and /dev/null differ diff --git a/worker.py b/worker.py index fe37356..14881d9 100644 --- a/worker.py +++ b/worker.py @@ -1,64 +1,137 @@ from threading import Thread import json import os -import shelve -from bs4 import BeautifulSoup -from time import perf_counter -import time -import pickle +from bs4 import 
BeautifulSoup import re #Data process from nltk.tokenize import word_tokenize from nltk.stem import PorterStemmer -from sklearn.feature_extraction.text import TfidfVectorizer -import pandas as pd -import numpy as np -from collections import Counter from posting import Posting +import math import sys +class Node(): + index_value = '' + postings = list() + +class Index(): + length = 0 + index = list() + class Worker(Thread): - def __init__(self,indexer,target): - self.file = target + def __init__(self,worker_id,indexer): self.indexer = indexer + self.stemmer = PorterStemmer() + self.worker_id = worker_id + self.num_partial = 0 + self.index = dict() super().__init__(daemon=True) - def run(self): - print("Target: " + str(self.file)) - ticker = perf_counter() - file_load = open(self.file) - data = json.load(file_load) - soup = BeautifulSoup(data["content"],features="lxml") - # Gets a cleaner version text comparative to soup.get_text() - tic = perf_counter() - clean_text = ' '.join(soup.stripped_strings) - # Looks for large white space, tabbed space, and other forms of spacing and removes it - # Regex expression matches for space characters excluding a single space or words - clean_text = re.sub(r'\s[^ \w]', '', clean_text) - # Tokenizes text and joins it back into an entire string. 
def dump(self):
    """Write the in-memory index to a numbered pair of partial-index files.

    ``temp/<worker_id>_<num_partial>.partial`` receives one JSON-encoded
    ``Node`` (term plus its postings) per line.
    ``temp/<worker_id>_<num_partial>.index`` receives a JSON-encoded
    ``Index`` whose ``index`` attribute lists ``(term, position)`` pairs
    sorted by term, where ``position`` is the value ``tell()`` reported
    just before that term's line was written.

    Afterwards the partial is registered with the indexer, the partial
    counter is advanced and the in-memory index is cleared.
    """
    import os

    partial_name = str(self.worker_id) + "_" + str(self.num_partial)
    cur_partial_index_str = "temp/" + partial_name + '.partial'
    cur_partial_index_index_str = "temp/" + partial_name + '.index'

    # open(..., 'w') does not create missing directories.
    os.makedirs("temp", exist_ok=True)

    part_index = Index()
    part_index.length = 0
    part_index.index = list()

    # Context managers guarantee both files are flushed and closed before
    # the partial is announced to the indexer below.
    with open(cur_partial_index_str, 'w') as cur_partial_index:
        for key, postings in self.index.items():
            node = Node()
            node.index_value = key
            node.postings = postings

            jsonStr = json.dumps(node, default=lambda o: o.__dict__, sort_keys=False)

            # Record where this term's line starts so it can be seeked to.
            part_index.index.append((node.index_value, cur_partial_index.tell()))
            cur_partial_index.write(jsonStr + '\n')
            part_index.length = part_index.length + 1

    part_index.index.sort(key=lambda y: y[0])
    jsonStr = json.dumps(part_index, default=lambda o: o.__dict__, sort_keys=False)
    with open(cur_partial_index_index_str, 'w') as cur_partial_index_index:
        cur_partial_index_index.write(jsonStr)

    self.indexer.add_partial_index(partial_name)
    self.num_partial = self.num_partial + 1
    self.index.clear()


def run(self):
    """Index files handed out by the indexer until none remain.

    For every file: load its JSON, extract and normalise the text, count
    each term's occurrences and token positions, append a ``Posting`` per
    term to ``self.index`` and store the document's weight in
    ``self.indexer.weight``. The index is dumped to disk whenever the size
    check at the bottom of the loop trips, and once more when
    ``get_next_file()`` returns a falsy value, after which the loop exits.
    """
    while True:
        target = self.indexer.get_next_file()
        if not target:
            self.dump()
            print("Worker " + str(self.worker_id) + " died")
            break

        # Close the input file as soon as it is parsed; one handle per
        # document would otherwise stay open.
        with open(target) as file_load:
            data = json.load(file_load)

        soup = BeautifulSoup(data["content"], features="lxml")
        # File name without its directory and without the last 5 characters
        # (presumably the ".json" suffix - verify against the corpus layout).
        doc_id = target[target.rfind('/')+1:-5]
        url = data['url']
        print("Worker " + str(self.worker_id) + " working on " + url)

        # Stemmed words found inside emphasised tags.
        # NOTE(review): this dict is filled but not read anywhere in this
        # method - confirm whether it is still needed.
        important = {'b' : [], 'h1' : [], 'h2' : [], 'h3' : [], 'title' : []}
        for key_words in important:
            for i in soup.find_all(key_words):
                for word in word_tokenize(i.text):
                    important[key_words].append(self.stemmer.stem(word))

        # Gets a cleaner version text comparative to soup.get_text()
        clean_text = ' '.join(soup.stripped_strings)
        # Removes every whitespace character that is immediately followed by
        # a character that is neither a space nor a word character, together
        # with that following character.
        clean_text = re.sub(r'\s[^ \w]', '', clean_text)
        # Keep only purely alphanumeric ASCII tokens, stem them and join them
        # back into a single string.
        clean_text = " ".join(
            self.stemmer.stem(i)
            for i in clean_text.split()
            if re.fullmatch('[A-Za-z0-9]+', i)
        )

        tokens = word_tokenize(clean_text)

        # counter maps term -> [raw count, list of token positions]
        counter = dict()
        for position, word in enumerate(tokens):
            entry = counter.setdefault(word, [0, []])
            entry[0] = entry[0] + 1
            entry[1].append(position)

        doc_length = len(tokens)
        total = 0
        for index, (count, positions) in counter.items():
            tf = count / doc_length
            log_tf = 1 + math.log(tf)
            total = total + log_tf * log_tf
            postings = self.index.setdefault(index, [])
            # The fourth Posting argument is passed as 0 here - presumably a
            # score filled in later; verify against Posting.
            postings.append(Posting(doc_id, url, tf, 0, positions))
            # Keep every posting list ordered by doc_id, not only new ones.
            postings.sort(key=lambda y: y.doc_id)

        # Document weight: square root of the summed squared (1 + log tf).
        self.indexer.weight[doc_id] = math.sqrt(total)

        # Flush once the dict itself exceeds 1,000,000 bytes. sys.getsizeof
        # is shallow: it does not count the keys or posting lists the dict
        # refers to, so real memory use is higher than this figure.
        if sys.getsizeof(self.index) > 1000000:
            self.dump()