Commit 68aef175 authored by sim's avatar sim

Brand new scrapers infrastructure using scrapy

parent b68c1ae0
......@@ -26,6 +26,9 @@ djangorestframework-jwt = "*"
django-celery-beat = "*"
python-decouple = "*"
alembic = "*"
scrapy = "*"
jmespath = "*"
risparser = "*"
[requires]
......
{
"_meta": {
"hash": {
"sha256": "c8907533f349d8390d9825628006b0b1baad612cde3dc5633df87706a803567c"
"sha256": "d94567674a7b0441d3a9ba14b73201e335c3511ee2dd75306138b635dc1eedc7"
},
"pipfile-spec": 6,
"requires": {
......@@ -36,6 +36,27 @@
],
"version": "==1.4.0"
},
"asn1crypto": {
"hashes": [
"sha256:2f1adbb7546ed199e3c90ef23ec95c5cf3585bac7d11fb7eb562a3fe89c64e87",
"sha256:9d5c20441baf0cb60a4ac34cc447c6c189024b6b4c6cd7877034f4965c464e49"
],
"version": "==0.24.0"
},
"attrs": {
"hashes": [
"sha256:1c7960ccfd6a005cd9f7ba884e6316b5e430a3f1a6c37c5f87d8b43f83b54ec9",
"sha256:a17a9573a6f475c99b551c0e0a812707ddda1ec9653bed04c13841404ed6f450"
],
"version": "==17.4.0"
},
"automat": {
"hashes": [
"sha256:2140297df155f7990f6f4c73b2ab0583bd8150db9ed2a1b48122abe66e9908c1",
"sha256:3c1fd04ecf08ac87b4dd3feae409542e9bf7827257097b2b6ed5692f69d6f6a8"
],
"version": "==0.6.0"
},
"billiard": {
"hashes": [
"sha256:1d7b22bdc47aa52841120fcd22a74ae4fc8c13e9d3935643098184f5788c3ce6",
......@@ -50,6 +71,81 @@
],
"version": "==4.1.0"
},
"cffi": {
"hashes": [
"sha256:151b7eefd035c56b2b2e1eb9963c90c6302dc15fbd8c1c0a83a163ff2c7d7743",
"sha256:1553d1e99f035ace1c0544050622b7bc963374a00c467edafac50ad7bd276aef",
"sha256:1b0493c091a1898f1136e3f4f991a784437fac3673780ff9de3bcf46c80b6b50",
"sha256:2ba8a45822b7aee805ab49abfe7eec16b90587f7f26df20c71dd89e45a97076f",
"sha256:3c85641778460581c42924384f5e68076d724ceac0f267d66c757f7535069c93",
"sha256:3eb6434197633b7748cea30bf0ba9f66727cdce45117a712b29a443943733257",
"sha256:4c91af6e967c2015729d3e69c2e51d92f9898c330d6a851bf8f121236f3defd3",
"sha256:770f3782b31f50b68627e22f91cb182c48c47c02eb405fd689472aa7b7aa16dc",
"sha256:79f9b6f7c46ae1f8ded75f68cf8ad50e5729ed4d590c74840471fc2823457d04",
"sha256:7a33145e04d44ce95bcd71e522b478d282ad0eafaf34fe1ec5bbd73e662f22b6",
"sha256:857959354ae3a6fa3da6651b966d13b0a8bed6bbc87a0de7b38a549db1d2a359",
"sha256:87f37fe5130574ff76c17cab61e7d2538a16f843bb7bca8ebbc4b12de3078596",
"sha256:95d5251e4b5ca00061f9d9f3d6fe537247e145a8524ae9fd30a2f8fbce993b5b",
"sha256:9d1d3e63a4afdc29bd76ce6aa9d58c771cd1599fbba8cf5057e7860b203710dd",
"sha256:a36c5c154f9d42ec176e6e620cb0dd275744aa1d804786a71ac37dc3661a5e95",
"sha256:ae5e35a2c189d397b91034642cb0eab0e346f776ec2eb44a49a459e6615d6e2e",
"sha256:b0f7d4a3df8f06cf49f9f121bead236e328074de6449866515cea4907bbc63d6",
"sha256:b75110fb114fa366b29a027d0c9be3709579602ae111ff61674d28c93606acca",
"sha256:ba5e697569f84b13640c9e193170e89c13c6244c24400fc57e88724ef610cd31",
"sha256:be2a9b390f77fd7676d80bc3cdc4f8edb940d8c198ed2d8c0be1319018c778e1",
"sha256:d5d8555d9bfc3f02385c1c37e9f998e2011f0db4f90e250e5bc0c0a85a813085",
"sha256:e55e22ac0a30023426564b1059b035973ec82186ddddbac867078435801c7801",
"sha256:e90f17980e6ab0f3c2f3730e56d1fe9bcba1891eeea58966e89d352492cc74f4",
"sha256:ecbb7b01409e9b782df5ded849c178a0aa7c906cf8c5a67368047daab282b184",
"sha256:ed01918d545a38998bfa5902c7c00e0fee90e957ce036a4000a88e3fe2264917",
"sha256:edabd457cd23a02965166026fd9bfd196f4324fe6032e866d0f3bd0301cd486f",
"sha256:fdf1c1dc5bafc32bc5d08b054f94d659422b05aba244d6be4ddc1c72d9aa70fb"
],
"markers": "platform_python_implementation != 'PyPy'",
"version": "==1.11.5"
},
"constantly": {
"hashes": [
"sha256:586372eb92059873e29eba4f9dec8381541b4d3834660707faf8ba59146dfc35",
"sha256:dd2fa9d6b1a51a83f0d7dd76293d734046aa176e384bf6e33b7e44880eb37c5d"
],
"version": "==15.1.0"
},
"cryptography": {
"hashes": [
"sha256:0d39a93cf25edeae1f796bbc5960e587f34513a852564f6345ea4491a86c5997",
"sha256:18d0b0fc21f39b35ea469a82584f55eeecec1f65a92d85af712c425bdef627b3",
"sha256:27a208b9600166976182351174948e128818e7fc95cbdba18143f3106a211546",
"sha256:28e4e9a97713aa47b5ef9c5003def2eb58aec89781ef3ef82b1c2916a8b0639b",
"sha256:2cfcee8829c5dec55597826d52c26bc26e7ce39adb4771584459d0636b0b7108",
"sha256:33b564196dcd563e309a0b07444e31611368afe3a3822160c046f5e4c3b5cdd7",
"sha256:41f94194ae78f83fd94ca94fb8ad65f92210a76a2421169ffa5c33c3ec7605f4",
"sha256:4f385ee7d39ee1ed74f1d6b1da03d0734ea82855a7b28a9e6e88c4091bc58664",
"sha256:55555d784cfdf9033e81f044c0df04babed2aa141213765d960d233b0139e353",
"sha256:69285f5615507b6625f89ea1048addd1d9218585fb886eb90bdebb1d2b2d26f5",
"sha256:6cb1224da391fa90f1be524eafb375b62baf8d3df9690b32e8cc475ccfccee5e",
"sha256:6fb22f63e17813f3d1d8e30dd1e249e2c34233ba1d3de977fd31cb5db764c7d0",
"sha256:7a2409f1564c84bcf2563d379c9b6148c5bc6b0ae46e109f6a7b4bebadf551df",
"sha256:8487524a1212223ca6dc7e2c8913024618f7ff29855c98869088e3818d5f6733",
"sha256:9a2945efcff84830c8e237ab037d0269380d75d400a89cc9e5628e52647e21be",
"sha256:9a47a80f65f4feaaf8415a40c339806c7d7d867152ddccc6ca87f707c8b7b565",
"sha256:a3c180d12ffb1d8ee5b33a514a5bcb2a9cc06cc89aa74038015591170c82f55d",
"sha256:a5f2c681fd040ed648513939a1dd2242af19bd5e9e79e53b6dcfa33bdae61217",
"sha256:b984523d28737e373c7c35c8b6db6001537609d47534892de189bebebaa42a47",
"sha256:d18df9cf3f3212df28d445ea82ce702c4d7a35817ef7a38ee38879ffa8f7e857",
"sha256:e4d967371c5b6b2e67855066471d844c5d52d210c36c28d49a8507b96e2c5291",
"sha256:ee245f185fae723133511e2450be08a66c2eebb53ad27c0c19b228029f4748a5",
"sha256:fc2208d95d9ecc8032f5e38330d5ace2e3b0b998e42b08c30c35b2ab3a4a3bc8"
],
"version": "==2.1.4"
},
"cssselect": {
"hashes": [
"sha256:066d8bc5229af09617e24b3ca4d52f1f9092d9e061931f4184cd572885c23204",
"sha256:3b5103e8789da9e936a68d993b70df732d06b8bb9a337a05ed4eb52c17ef7206"
],
"version": "==1.0.3"
},
"dateutils": {
"hashes": [
"sha256:c94a8e77d743abac79ed91f99f5ef594a972a527e05145cbb7aba59beced8a71"
......@@ -84,6 +180,34 @@
],
"version": "==1.11.0"
},
"hyperlink": {
"hashes": [
"sha256:98da4218a56b448c7ec7d2655cb339af1f7d751cf541469bb4fc28c4a4245b34",
"sha256:f01b4ff744f14bc5d0a22a6b9f1525ab7d6312cb0ff967f59414bbac52f0a306"
],
"version": "==18.0.0"
},
"idna": {
"hashes": [
"sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
"sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
],
"version": "==2.6"
},
"incremental": {
"hashes": [
"sha256:717e12246dddf231a349175f48d74d93e2897244939173b01974ab6661406b9f",
"sha256:7b751696aaf36eebfab537e458929e194460051ccad279c72b755a167eebd4b3"
],
"version": "==17.5.0"
},
"jmespath": {
"hashes": [
"sha256:6a81d4c9aa62caf061cb517b4d9ad1dd300374cd4706997aff9cd6aedd61fc64",
"sha256:f11b4461f425740a1d908e9a3f7365c3d2e569f6ca68a2ff8bc5bcd9676edd63"
],
"version": "==0.9.3"
},
"kombu": {
"hashes": [
"sha256:01f0da9fe222a2183345004243d1518c0fbe5875955f1b24842f2d9c65709ade",
......@@ -91,6 +215,39 @@
],
"version": "==4.1.0"
},
"lxml": {
"hashes": [
"sha256:0ee07da52d240f1dc3c83eef5cd5f1b7f018226c1121f2a54d446645779a6d17",
"sha256:155521c337acecf8202091cff85bb9f709f238130ebadf04280fb1db11f5ad8b",
"sha256:155c916cf2645b4a8f2bd5d09065e92d1b67b8d464bdc001e0b524af84bedf6f",
"sha256:2190266059fec3c5a55f9d6c30532c64c6d414d3228909c0af573fe4907e78d1",
"sha256:29a36e354c39b2e24bc4ee103de53417ebb80f976a6ab9e8d093d559e2ac03e1",
"sha256:2dedfeeecc2d5a939cf622602f5a1ce443ca82407f386880f739f1a9f08053ad",
"sha256:3b33549fb8f91b38a7500078242b03cca513f3412a2cdae722e89bf83f95971d",
"sha256:4187c4b0cefc3353181db048c51f42c489d9ac51e40b86c4851dc0671372971d",
"sha256:41f59cbdab232f11680d5d4dec9f2e6782fd24d78e37ee833447702e34e675f4",
"sha256:470d7ce41e8047208ba1a376560bad17f1468df1f3097bc83902b26cfafdbb0c",
"sha256:49a655956f8de69e1258bc0fcfc43eb3bd1e038655784d77d1869b4b81444e37",
"sha256:4c12e90886d9c53ab434c8d0cebea122321cce19614c3c6b6d1a7700d7cc6212",
"sha256:6cba398eb37e0631e60e0e080c101cfe91769b2c8267105b64b4625e2581ea21",
"sha256:79322000279cda10b53c374d53ca632ead3bc51c6aebf8e62c8fa93a4d08b750",
"sha256:87a66bcadac270fc010cb029022a93fc722bf1204a8b03e782d4c790f0edf7ca",
"sha256:940caef1ec7c78e0c34b0f6b94fe42d0f2022915ffc78643d28538a5cfd0f40e",
"sha256:950e63387514aa1b881eba5ac6cb2ec51a118b3dafe99dd80ca19d8fb0142f30",
"sha256:af8a5373241d09b8fc53e0490e1719ce5dc90a21b19db89b6596c1adcdd52270",
"sha256:b106d4d2383382399ad82108fd187e92f40b1c90f55c2d36bbcb1c44bcf940fc",
"sha256:ba05732e4bcf59e948f61588851dcf620fd60d5bbd9d704203e5f59bbaa60219",
"sha256:d2c985d2460b81c6ca5feb8b86f1bc594ad59405d0bdf68626b85852b701553c",
"sha256:d5d29663e979e83b3fc361e97200f959cddb3a14797391d15273d84a5a8ae44b",
"sha256:dd291debfaa535d9cb6cee8d7aca2328775e037d02d13f1634e57f49bc302cc4",
"sha256:e37427d5a27eefbcfc48847e0b37f348113fac7280bc857421db39ffc6372570",
"sha256:e608839a5ee2180164424ccf279c8e2d9bbe8816d002c58fd97d6b621ba4aa94",
"sha256:e6b6698415c7e8d227a47a3b1038e1b37c2b438a1b48c2db7ad9e74ddbcd1149",
"sha256:e7e41d383f19bab9d57f5f3b18d158655bcd682e7e723f441b9e183e1e35a6b5",
"sha256:fa7320679ced5e25b20203d157280680fc84eb783b6cc650cb0c98e1858b7dd3"
],
"version": "==4.1.1"
},
"mako": {
"hashes": [
"sha256:4e02fde57bd4abb5ec400181e4c314f56ac3e49ba4fb8b0d50bba18cb27d25ae"
......@@ -103,6 +260,13 @@
],
"version": "==1.0"
},
"parsel": {
"hashes": [
"sha256:1a9ac0c1db8175547e1732be57ced2a2dc0714590f6b249d022ad25d918ef923",
"sha256:2f3a6813a0ff39b6ca2530b9c1ad25d83e3a33808d93dd21fbf114c6232a16a8"
],
"version": "==1.4.0"
},
"psycopg2-binary": {
"hashes": [
"sha256:02eb674e3d5810e19b4d5d00720b17130e182da1ba259dda608aaf33d787347d",
......@@ -135,6 +299,47 @@
],
"version": "==2.7.4"
},
"pyasn1": {
"hashes": [
"sha256:0d7f6e959fe53f3960a23d73f35e1fce61348b30915b6664309ca756de7c1f89",
"sha256:5a0db897b311d265cde49615cf783f1c78613138605cdd0f907ecfa5b2aba3ee",
"sha256:758cb50abddc03e4563fd9e7f03db56e3e87b58c0bd01247360326e5c0c7ffa5",
"sha256:7d626683e3d792cccc608da02498aff37ab4f3dafd8905d6bf755d11f9b26b43",
"sha256:a7efe807c4b83a859e2735c692b92ed7b567cfddc4163763412920041d876c2b",
"sha256:b5a9ca48055b9a20f6d1b3d68e38692e5431c86a0f99ea602e61294e891fee5b",
"sha256:c07d6e587b2f928366b1f67c09bda026a3e6fcc99e80a744dc67f8fca3895626",
"sha256:d258b0a71994f7770599835249cece1caef3c70def868c4915e6e5ca49b67d15",
"sha256:d5cd6ed995dba16fad0c521cfe31cd2d68400b53fcc2bce93326829be73ab6d1",
"sha256:d84c2aea3cf43780e9e6a19f4e4dddee9f6976519020e64e47c57e5c7a8c3dd2",
"sha256:e85895087905c65b5b594eb91f7522664c85545b147d5f4d4e7b1b07da8dcbdc",
"sha256:f81c96761fca60d64b1c9b79ec2e40cf9495a745cf570613079ef324aeb9672b"
],
"version": "==0.4.2"
},
"pyasn1-modules": {
"hashes": [
"sha256:041e9fbafac548d095f5b6c3b328b80792f006196e15a232b731a83c93d59493",
"sha256:0cdca76a68dcb701fff58c397de0ef9922b472b1cb3ea9695ca19d03f1869787",
"sha256:0cea139045c38f84abaa803bcb4b5e8775ea12a42af10019d942f227acc426c3",
"sha256:0f2e50d20bc670be170966638fa0ae603f0bc9ed6ebe8e97a6d1d4cef30cc889",
"sha256:47fb6757ab78fe966e7c58b2030b546854f78416d653163f0ce9290cf2278e8b",
"sha256:598a6004ec26a8ab40a39ea955068cf2a3949ad9c0030da970f2e1ca4c9f1cc9",
"sha256:72fd8b0c11191da088147c6e4678ec53e573923ecf60b57eeac9e97433e09fc2",
"sha256:854700bbdd01394e2ada9c1bfbd0ed9f5d0c551350dbbd023e88b11d2771ae06",
"sha256:af00ea8f2022b6287dc375b2c70f31ab5af83989fc6fe9eacd4976ce26cd7ccc",
"sha256:b1f395cae2d669e0830cb023aa86f9f283b7a9aa32317d7f80d8e78aa2745812",
"sha256:c6747146e95d2b14cc2a8399b2b0bde3f93778f8f9ec704690d2b589c376c137",
"sha256:f53fe5bcebdf318f51399b250fe8325ef3a26d927f012cc0c8e0f9e9af7f9deb"
],
"version": "==0.2.1"
},
"pydispatcher": {
"hashes": [
"sha256:5570069e1b1769af1fe481de6dd1d3a388492acddd2cdad7a3bde145615d5caf",
"sha256:5be4a8be12805ef7d712dd9a93284fb8bc53f309867e573f653a72e5fd10e433"
],
"version": "==2.0.5"
},
"pyjwt": {
"hashes": [
"sha256:9c3016e4a292151c5396e25cc0c28c4e1cdf13fa19118eb84f500f9670e3f628",
......@@ -142,12 +347,19 @@
],
"version": "==1.6.0"
},
"pyopenssl": {
"hashes": [
"sha256:07a2de1a54de07448732a81e38a55df7da109b2f47f599f8bb35b0cbec69d4bd",
"sha256:2c10cfba46a52c0b0950118981d61e72c1e5b1aac451ca1bc77de1a679456773"
],
"version": "==17.5.0"
},
"python-dateutil": {
"hashes": [
"sha256:891c38b2a02f5bb1be3e4793866c8df49c7d19baabf9c1bad62547e0b4866aca",
"sha256:95511bae634d69bc7329ba55e646499a842bc4ec342ad54a8cdb65645a0aad3c"
"sha256:07009062406cffd554a9b4135cd2ff167c9bf6b7aac61fe946c93e69fad1bbd8",
"sha256:8f95bb7e6edbb2456a51a1fb58c8dca942024b4f5844cae62c90aa88afe6e300"
],
"version": "==2.6.1"
"version": "==2.7.0"
},
"python-decouple": {
"hashes": [
......@@ -175,6 +387,34 @@
],
"version": "==2018.3"
},
"queuelib": {
"hashes": [
"sha256:42b413295551bdc24ed9376c1a2cd7d0b1b0fa4746b77b27ca2b797a276a1a17",
"sha256:ff43b5b74b9266f8df4232a8f768dc4d67281a271905e2ed4a3689d4d304cd02"
],
"version": "==1.5.0"
},
"risparser": {
"hashes": [
"sha256:8e1c9ca67a6a90d6a876f5f886f67bab1660a951a2c0e78ed737bdccc2a05c1e",
"sha256:ff99cbbde93cb017236234363db6e17a0615b861313d0ea5ce04e6067848850f"
],
"version": "==0.4.2"
},
"scrapy": {
"hashes": [
"sha256:08d86737c560dcc1c4b73ac0ac5bd8d14b3e2265c1f7b195f0b73ab13741fe03",
"sha256:31a0bf05d43198afaf3acfb9b4fb0c09c1d7d7ff641e58c66e36117f26c4b755"
],
"version": "==1.5.0"
},
"service-identity": {
"hashes": [
"sha256:0e76f3c042cc0f5c7e6da002cf646f59dc4023962d1d1166343ce53bdad39e17",
"sha256:4001fbb3da19e0df22c47a06d29681a398473af4aa9d745eca525b3b2c2302ab"
],
"version": "==17.0.0"
},
"six": {
"hashes": [
"sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
......@@ -194,12 +434,55 @@
],
"version": "==0.33.0"
},
"twisted": {
"hashes": [
"sha256:0da1a7e35d5fcae37bc9c7978970b5feb3bc82822155b8654ec63925c05af75c",
"sha256:716805e624f9396fcc1f47e8aef68e629fd31599a74855b6e1636122c042458d",
"sha256:7bc3cdfd1ca5e5b84c7936db3c2cb2feb7d5b77410e713fd346da095a3b6a1d2"
],
"version": "==17.9.0"
},
"vine": {
"hashes": [
"sha256:52116d59bc45392af9fdd3b75ed98ae48a93e822cee21e5fda249105c59a7a72",
"sha256:6849544be74ec3638e84d90bc1cf2e1e9224cc10d96cd4383ec3f69e9bce077b"
],
"version": "==1.1.4"
},
"w3lib": {
"hashes": [
"sha256:55994787e93b411c2d659068b51b9998d9d0c05e0df188e6daf8f45836e1ea38",
"sha256:aaf7362464532b1036ab0092e2eee78e8fd7b56787baa9ed4967457b083d011b"
],
"version": "==1.19.0"
},
"zope.interface": {
"hashes": [
"sha256:11b068fc9916556f3820f38c2376c28d8e55e4a2c51c34915aaac38b75706d2e",
"sha256:16fe824b3d93ee0629aa1f04848a1b515d6b5dc9e98cc7a04feaa35fdb0de5f1",
"sha256:1d954d557b63124a65f2247ac6ed66fa36df18d1e8538d08c9b432e808a634de",
"sha256:3d033abd27cd54157cf42a3bfd4d8c28d7fc5c6f775df3332307d2632a79925b",
"sha256:4be05f79e952793f31a0c2d6a0672c81a3300315da587ce6a590357595217005",
"sha256:4cb1c56b0356da9a33249ef77a688c47107f54191c12a0055d284b6bee7f447e",
"sha256:5a8cc535f4212b134e66a3e1c6b93b19d453dbad0e2f89d0df2c01deefc8cad9",
"sha256:5d8813e438ab67a793b09e1223742b757dd95a4a64d466855a53cb113cc9c9c4",
"sha256:78321a6c0c8cc6ac928e44ef04d50384bc864a7f5e3c25b84110da2ede83739f",
"sha256:88e3d54e88a601f45d03e2a062d5d16852d20e0863a92c19260ae72e2586378a",
"sha256:8dfdc1588db31895f81bcba6c36dc981b4cf4a526c62eae3745bbfbe102477ef",
"sha256:9902d5fc11309e17cdce6574243dc114b9c30de5c60ab53c90f6e3e962688565",
"sha256:a16a3e07511fb6806bb48c8c661d38cdb91cd4bc6c2b6b0b173e72362ec1ceb4",
"sha256:a21d69de2ee89fc59de93e7a43c0379ecedb5149739ff94e910c2bf0ca18e181",
"sha256:a6375035a4b45d199a8b990e3a2f6b71906c318c56dfc14b2d58350b6ca59392",
"sha256:aef398a5b92e70b8152d2c4850bad0fe185adb50d948f32d0bba5694d82b67c7",
"sha256:b8f3491c9df4f0ffed32b275033e74041f420e5dcdefa4b1500d753c64ef42cf",
"sha256:bd626cd76b7e5cbecac9d3e0dd8f98e3eada15ead95713238a523f877327633d",
"sha256:d6d26d5dfbfd60c65152938fcb82f949e8dada37c041f72916fef6621ba5c5ce",
"sha256:dec19181cf6af58ccb8ba3fa3ca9d4ec555b2f3cb31f589f6e86d15df0926c31",
"sha256:f47d4138405eb67e5f059b9ab74e0a1147adc3277f5fe37d5bae5209b67e89e7",
"sha256:f6868378fffbb8651f1f8a767d17e42aed39926c8f6bb9c56f184022fe6c2090",
"sha256:ff20038fbc0e7ea050a7e28fcb8ae6ed8378a8d08ac70b848ea39960dda86bbf"
],
"version": "==4.4.3"
}
},
"develop": {
......
import logging
from pathlib import Path
from datetime import datetime
from urllib.parse import urlencode
from scrapy.spiders import Spider
from scrapy.signals import response_received
from scrapy.http.request import Request as BaseRequest
from .responses import TextResponse, HtmlResponse, XmlResponse, JsonResponse, \
RISResponse
__all__ = ['Scraper', 'Request', 'TextResponse', 'HtmlResponse', 'XmlResponse',
'JsonResponse', 'RISResponse']
class Request(BaseRequest):
    """Scrapy request that can build its query string from ``params``.

    All arguments are forwarded unchanged to the base request class except
    ``params``: when it is truthy it is URL-encoded with ``urlencode`` and
    appended to ``url`` before the base constructor runs.
    """

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None, params=None):
        if params:
            # Extend an existing query string instead of starting a second one:
            # "http://host/?a=1" + params must give "...?a=1&b=2", not "...?a=1?b=2".
            if '?' not in url:
                separator = '?'
            elif url.endswith(('?', '&')):
                separator = ''
            else:
                separator = '&'
            url += separator + urlencode(params)
        super().__init__(url, callback, method, headers, body, cookies, meta,
                         encoding, priority, dont_filter, errback, flags)
class Scraper(Spider):
    """Base spider shared by gargantext scrapers.

    Only the spider arguments named in ``ARGUMENTS`` are accepted. When ``url``
    is set it is the single start request; otherwise start requests come from
    ``self.dispatch()``, which this class does not define and subclasses are
    expected to provide. A subclass may set ``default_parser`` to the name of
    the method to install as ``parse`` when it does not define ``parse`` itself.
    Every received response is also dumped to ``DEBUG_DIR`` by ``trace``.
    """

    # Upper bound applied to ``count`` by the ``limit`` property (None: no bound).
    MAX_COUNT = None
    # Not used in this class; presumably a page size for subclasses -- confirm.
    BATCH_SIZE = 100
    # Directory where ``trace`` writes response bodies.
    DEBUG_DIR = '/tmp'
    # Whitelist of keyword arguments copied onto the spider by ``__init__``.
    ARGUMENTS = ['url', 'count', 'query', 'count_only']

    # Defaults for the whitelisted spider arguments.
    # NOTE(review): ``limit`` compares ``count`` with ``MAX_COUNT`` without any
    # conversion; confirm callers pass ``count`` as an int rather than a string.
    url = None
    count = None
    query = ''
    count_only = False

    def __init__(self, *args, **kwargs):
        # The default __init__ method will take any spider arguments and copy
        # them to the spider as attributes: filter arguments for security
        # purposes.
        spider_args = {k: v for k, v in kwargs.items() if k in self.ARGUMENTS}
        super().__init__(*args, **spider_args)

        default_parser = getattr(self, 'default_parser', None)
        # ``hasattr(self, 'parse')`` cannot detect a missing override when the
        # base Spider already provides ``parse``; compare against the base
        # attribute instead so the fallback is installed only when the
        # subclass left ``parse`` untouched.
        parse_is_inherited = \
            getattr(type(self), 'parse', None) is getattr(Spider, 'parse', None)
        if default_parser and parse_is_inherited:
            # XXX Use setattr to bypass pylint warning...
            setattr(self, 'parse', getattr(self, default_parser))

    def start_requests(self):
        """Yield a single request for ``url`` when set, else delegate to ``dispatch``."""
        if self.url:  # and self.url.startswith('file://'):
            yield Request(self.url)
        else:
            yield from self.dispatch()

    @property
    def logger_name(self):
        """Logger name of this spider: ``scrapers.<spider name>``."""
        return 'scrapers.%s' % self.name

    @property
    def logger(self):
        """Logger adapter named ``logger_name`` carrying the spider as extra context."""
        logger = logging.getLogger(self.logger_name)
        return logging.LoggerAdapter(logger, {'spider': self})

    @property
    def limit(self):
        """Effective item limit derived from ``count`` and ``MAX_COUNT``.

        Without ``MAX_COUNT`` the result is ``count`` (or 0 when unset); without
        ``count`` it is ``MAX_COUNT``; otherwise the smaller of the two.
        """
        if self.MAX_COUNT is None:
            return self.count or 0
        if self.count is None:
            return self.MAX_COUNT
        return min(self.count, self.MAX_COUNT)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and connect ``trace`` to the ``response_received`` signal."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.trace, signal=response_received)
        return spider

    def trace(self, response, request, spider):
        """Log the response type and write the response body under ``DEBUG_DIR``.

        The file is named ``<logger_name>-<timestamp><ext>`` where the extension
        is chosen from the response class (empty when none matches).
        """
        content_type = response.headers.get('content-type', b'').decode()
        self.logger.info('Content-Type=%s; type(response)=%s;',
                         content_type, type(response).__name__)

        path = Path(self.DEBUG_DIR).absolute()
        # %M is minutes and %S is seconds. The former "%m" repeated the month
        # and "%s" is a platform-specific extension, not a portable directive.
        date = datetime.now().strftime("%Y%m%d_%H%M_%S.%f")
        ext = '.html' if isinstance(response, HtmlResponse) else \
              '.xml' if isinstance(response, XmlResponse) else \
              '.json' if isinstance(response, JsonResponse) else \
              '.txt' if isinstance(response, TextResponse) else \
              ''
        filename = '%s-%s%s' % (spider.logger_name, date, ext)
        filepath = str(path / filename)

        with open(filepath, 'wb') as f:
            f.write(response.body)
class ExpectsMiddleware(object):
    """Downloader middleware enforcing a spider's ``expects`` attribute.

    ``expects`` may be a single class or a tuple of classes. A response that
    is not an instance of it is rejected with ``TypeError``; spiders without
    ``expects`` (or with ``expects = None``) are left alone.
    """

    def process_response(self, request, response, spider):
        """Return ``response`` unchanged, or raise ``TypeError`` on a type mismatch."""
        accepted = getattr(spider, 'expects', None)
        if accepted is None or isinstance(response, accepted):
            return response

        # Build a readable description of what the spider wanted.
        if type(accepted) is tuple:
            wanted = ' or '.join(kind.__name__ for kind in accepted)
        else:
            wanted = accepted.__name__

        raise TypeError("%s: %s expected, got %s instead." % (
            spider.name, wanted, response.__class__.__name__))
from w3lib.url import file_uri_to_path
from scrapy.utils.decorators import defers
from .responses import responsetypes, TextResponse
class FileDownloadHandler(object):
    """Download handler for ``file://`` URLs that hands an open stream to the response.

    The response class is the spider's ``expects`` attribute when it names a
    single class; otherwise it is detected from the first ``CHUNK_SIZE`` bytes
    of the file. The response is built with ``stream=`` set to a freshly
    opened file object, which is not closed here.
    """

    # Number of bytes read from the file to detect the response type.
    CHUNK_SIZE = 5000

    def __init__(self, settings):
        pass

    @defers
    def download_request(self, request, spider):
        """Build a streamed response for the local file designated by ``request.url``."""
        filepath = file_uri_to_path(request.url)

        with open(filepath, 'rb') as fo:
            body_chunk = fo.read(self.CHUNK_SIZE)

        expects = getattr(spider, 'expects', None)
        if expects and not isinstance(expects, tuple):
            respcls = expects
        else:
            # ``expects`` may also be a tuple of acceptable classes (see how
            # ExpectsMiddleware reports it); a tuple cannot be instantiated,
            # so fall back to detection from the file content in that case.
            # Detect response type only from data, don't trust filename extension
            respcls = responsetypes.from_args(body=body_chunk)

        # Text-based responses get a text-mode stream, everything else bytes.
        stream = open(filepath) if issubclass(respcls, TextResponse) else \
                 open(filepath, 'rb')

        return respcls(url=request.url, stream=stream)
from datetime import datetime
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Compose, MapCompose, Identity
from .processors import filter_empty
# Shared field declarations reused by the item classes of this module.
# NOTE(review): the metadata key here is ``serialize``; confirm this is the key
# the item exporters in use actually read (Scrapy documents ``serializer``).
DateTime = Field(serialize=str)
# Plain field without metadata.
String = Field()
class Document(Item):
    # Scraped document item. Every attribute below reuses one of the two
    # module-level field declarations (``String`` or ``DateTime``).
    id = String
    title = String
    abstract = String
    source = String
    url = String
    lang = String
    authors = String
    # Date fields share the ``DateTime`` declaration.
    publication = DateTime
    creation = DateTime
class DocumentLoader(ItemLoader):
    """Base item loader producing ``Document`` items from a selector.

    Subclasses implement ``parse`` to feed values into the loader; ``load``
    runs ``parse`` on the loader's selector and returns the loaded item.
    """

    default_item_class = Document
    default_output_processor = TakeFirst()

    # Turns collected date components into a ``datetime``: strip and convert
    # each to int, drop falsy values, then call ``datetime(*components)``.
    # NOTE(review): ``filter_empty`` drops every falsy value, which after the
    # int conversion includes 0 (e.g. an hour or minute of 0) -- confirm the
    # inputs never carry zero components.
    to_datetime = Compose(MapCompose(str.strip, int), filter_empty, lambda args: datetime(*args))

    publication_out = to_datetime
    creation_out = to_datetime
    # Keep the full list of authors instead of only the first value.
    authors_out = Identity()

    def __init__(self, selector, *args, **kwargs):
        # ``selector`` is mandatory and positional here; forward it by keyword.
        kwargs['selector'] = selector
        super().__init__(*args, **kwargs)

    def add_xpaths_text(self, xpaths):
        """Register the text node of each XPath in ``xpaths`` ({field name: xpath})."""
        for field_name, xpath in xpaths.items():
            self.add_xpath(field_name, '%s/text()' % xpath)

    def add_values(self, values):
        """Add every literal value of ``values`` ({field name: value})."""
        for field_name, value in values.items():
            self.add_value(field_name, value)

    def parse(self, obj):
        """Populate the loader from ``obj``; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, on this base class.
        """
        # The exception used to be returned instead of raised, so using the
        # base class directly silently produced an empty item.
        raise NotImplementedError("don't use DocumentLoader directly.")

    def load(self):
        """Run ``parse`` on the loader's selector and return the loaded item."""
        self.parse(self.selector)
        return self.load_item()
__all__ = ['filter_empty']
def filter_empty(iterable):
    """Return a list of the truthy elements of ``iterable``, in order."""
    return [element for element in iterable if element]
import logging
import jmespath
from abc import ABC
from weakref import WeakValueDictionary
from scrapy.http import \
TextResponse as BaseText, HtmlResponse as BaseHtml, XmlResponse as BaseXml
from scrapy import responsetypes as _responsetypes
from scrapy.responsetypes import ResponseTypes as BaseResponseTypes
from gargantext.utils.json import json_loads
from RISparser.parser import Ris
from RISparser.config import TAG_KEY_MAPPING
logger = logging.getLogger('scrapers')
# To be used in conjunction with gargantext.datasource.file.FileDownloadHandler
class StreamableMixin(object):
    """Mixin letting a response read its body from an open ``stream``.

    The optional ``stream`` keyword argument is removed from ``kwargs`` before
    the remaining arguments reach the next ``__init__`` in the MRO.
    """
    # NOTE(review): verify that the base response's ``body`` attribute really
    # dispatches to the ``_get_body`` override defined here.

    def __init__(self, *args, **kwargs):
        stream = kwargs.pop('stream', None)
        self.stream = stream
        self._cached_stream_data = None
        super().__init__(*args, **kwargs)

    def readlines(self):
        """Return an iterator over the stream."""
        return iter(self.stream)

    def _get_body(self):
        """Return the stream content (read once, then cached) or the inherited body."""
        if self.stream is None:
            return super()._get_body()
        if self._cached_stream_data is None:
            self._cached_stream_data = self.stream.read()
        return self._cached_stream_data
# Stream-aware counterparts of Scrapy's text, HTML and XML response classes.
class TextResponse(StreamableMixin, BaseText):
    pass


class HtmlResponse(StreamableMixin, BaseHtml, ABC): pass
# NOTE(review): ``register`` makes TextResponse a virtual subclass of
# HtmlResponse, i.e. isinstance(<TextResponse instance>, HtmlResponse) is True,
# not the reverse. Confirm this direction is the intended one.
HtmlResponse.register(TextResponse)


class XmlResponse(StreamableMixin, BaseXml, ABC): pass
# NOTE(review): same direction as above: TextResponse instances pass
# isinstance(..., XmlResponse) checks.
XmlResponse.register(TextResponse)
class ParseableResponse(TextResponse):
    """Text response whose content is parsed once into a data structure.

    Subclasses implement ``parse``; the result is exposed through ``data`` and
    can be queried with JMESPath expressions through ``jmes``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._cached_data = None
        # Compiled JMESPath expressions, keyed by their source string. A plain
        # dict is required: with a WeakValueDictionary the compiled expression
        # had no strong reference left once ``jmes`` returned, so no entry
        # could outlive the call that created it.
        self._jmes_cache = {}

    def parse(self):
        """Parse the response content; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, on this base class.
        """
        raise NotImplementedError("don't use ParseableResponse directly")

    @property
    def data(self):
        """Result of ``parse``, computed on first access and then reused.

        A ``None`` result is not cached: ``parse`` runs again on the next access.
        """
        if self._cached_data is None:
            self._cached_data = self.parse()
        return self._cached_data

    def jmes(self, path):
        """Evaluate the JMESPath expression ``path`` against ``data``."""
        compiled = self._jmes_cache.get(path)
        if compiled is None:
            compiled = self._jmes_cache[path] = jmespath.compile(path)
        return compiled.search(self.data)
class JsonResponse(ParseableResponse):
    """Parseable response whose data is decoded from JSON."""

    def parse(self):
        """Decode ``self.text`` with the project's ``json_loads`` helper."""
        raw_text = self.text
        return json_loads(raw_text)
class RISResponse(ParseableResponse):
    """Parseable response whose data comes from the RIS parser."""

    class RIS(Ris):
        """``Ris`` parser bound to ``TAG_KEY_MAPPING`` with its own ``PATTERN``."""

        # Overrides ``PATTERN`` on the parser class; presumably the regex used
        # to recognise tag lines -- confirm against RISparser.
        PATTERN = '^[A-Z][A-Z0-9] -'

        def __init__(self, lines):
            super().__init__(lines, TAG_KEY_MAPPING)

    def parse(self):
        """Feed the response lines to the nested ``RIS`` parser and return its result."""
        source_lines = self.readlines()
        parser = self.RIS(source_lines)
        return parser.parse()
class ResponseTypes(BaseResponseTypes):
    # Maps MIME types to the dotted paths of the response classes defined in
    # gargantext.datasource.responses. JSON types get JsonResponse, and the
    # remaining entries point at this package's Html/Xml/Text responses.
    # NOTE(review): presumably consumed by the base class to pick a response
    # class from a content type -- confirm against scrapy.responsetypes.
    CLASSES = {
        'text/html': 'gargantext.datasource.responses.HtmlResponse',
        'application/atom+xml': 'gargantext.datasource.responses.XmlResponse',
        'application/rdf+xml': 'gargantext.datasource.responses.XmlResponse',
        'application/rss+xml': 'gargantext.datasource.responses.XmlResponse',
        'application/xhtml+xml': 'gargantext.datasource.responses.HtmlResponse',
        'application/vnd.wap.xhtml+xml': 'gargantext.datasource.responses.HtmlResponse',
        'application/xml': 'gargantext.datasource.responses.XmlResponse',
        'application/json': 'gargantext.datasource.responses.JsonResponse',
        'application/x-json': 'gargantext.datasource.responses.JsonResponse',
        'application/openapi+json': 'gargantext.datasource.responses.JsonResponse',
        'application/json-amazonui-streaming': 'gargantext.datasource.responses.TextResponse',
        'application/javascript': 'gargantext.datasource.responses.TextResponse',
        'application/x-javascript': 'gargantext.datasource.responses.TextResponse',
        'text/xml': 'gargantext.datasource.responses.XmlResponse',
        'text/*': 'gargantext.datasource.responses.TextResponse',
    }


# Replace the ``responsetypes`` attribute of the imported scrapy module with an
# instance of the class above, and expose the same instance from this module.
_responsetypes.responsetypes = responsetypes = ResponseTypes()
......@@ -264,3 +264,21 @@ API_TOKENS = {
# BOOL Interpreter
BOOL_TOOLS_PATH = "gargantext/util/crawlers/sparql"
# Scrapy settings
BOT_NAME = 'gargantext'

# Package(s) listed as the location of the spiders.
SPIDER_MODULES = ['gargantext.scrapers']

DOWNLOADER_MIDDLEWARES = {
    # Will check HTTP responses according to 'expects' attribute of scrapers
    'gargantext.datasource.downloadermiddlewares.ExpectsMiddleware': 1,
}

DOWNLOAD_HANDLERS = {
    # Enable streamed file processing to handle large files
    'file': 'gargantext.datasource.file.FileDownloadHandler',
    # Disable s3 handler
    's3': None,
}

# Request throttling.
# NOTE(review): DOWNLOAD_DELAY is presumably expressed in seconds -- confirm
# against the Scrapy settings reference.
DOWNLOAD_DELAY = 0.6
CONCURRENT_REQUESTS_PER_IP = 8
......@@ -12,11 +12,18 @@ read -r -d '' DJANGO_VAR <<EOF
DJANGO_SETTINGS_MODULE=$DSM
EOF
# Template of the SCRAPY_SETTINGS_MODULE entry written to the env file; it
# reuses $DSM, the same value used for DJANGO_SETTINGS_MODULE.
read -r -d '' SCRAPY_VAR <<EOF
# Scrapy settings module, it is unlikely that you'll need to change that.
# WARNING: It will be overwritten!
SCRAPY_SETTINGS_MODULE=$DSM
EOF
build_env () {
cat << EOF > $ENV_FILE
# ENVIR can be dev or prod
ENVIR=$ENVIR
$DJANGO_VAR
$SCRAPY_VAR
# Paths of configuration files, you're welcome to change that; when a simple
# filename is given, it'll be searched in current directory.
GARGANTEXT_CONF=$GARGANTEXT_CONF
......@@ -28,6 +35,9 @@ update_env () {
grep -Eq '^\s*DJANGO_SETTINGS_MODULE=' "$ENV_FILE" \
&& sed -E -i "s/^(\\s*DJANGO_SETTINGS_MODULE=).*/\\1$DSM/g" $ENV_FILE \
|| echo "$DJANGO_VAR" >> "$ENV_FILE"
grep -Eq '^\s*SCRAPY_SETTINGS_MODULE=' "$ENV_FILE" \
&& sed -E -i "s/^(\\s*SCRAPY_SETTINGS_MODULE=).*/\\1$DSM/g" $ENV_FILE \
|| echo "$SCRAPY_VAR" >> "$ENV_FILE"
}
[ -f "$ENV_FILE" ] && update_env || build_env
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment