Commit 4df691f2 authored by Alexandre Delanoë

[DOC]
# OpenAlex Database API Crawler for GarganText
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0997d71d",
"metadata": {},
"outputs": [],
"source": [
"# 06-26-2023\n",
"# sylvain.fontaine@cnrs.fr\n",
"# python 3.8"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4602c33",
"metadata": {},
"outputs": [],
"source": [
"import requests, re\n",
"from tqdm.auto import tqdm\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "62b3470f",
"metadata": {},
"source": [
"# Design the URL request for the API\n",
"- Set what we want to extract: Sources, Works, Authors, Concepts, Funding...\n",
"- For each of them, specific filters has to be set (see the doc for appropriate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3504e1b",
"metadata": {},
"outputs": [],
"source": [
"# example for extracting the works published in neuroscience before 2021\n",
"type_metadata = 'works'\n",
"filters = ['publication_year:<2021','concepts.id:C169760540']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4375fb97",
"metadata": {},
"outputs": [],
"source": [
"URL = f'https://api.openalex.org/{type_metadata}?&mailto=my.mail@domain.com&filter='\n",
"for f in filters:\n",
" URL += f'{f},'\n",
"URL = URL[:-1]\n",
"URL"
]
},
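{
"cell_type": "markdown",
"id": "aa000001",
"metadata": {},
"source": [
"Optional sanity check (a minimal sketch): request a single result and read `meta.count` to preview how many works match the current filters. The query step below relies on the same field to choose between page-based and cursor-based paging."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa000002",
"metadata": {},
"outputs": [],
"source": [
"# minimal sketch: preview the number of matching works before the full crawl;\n",
"# 'per-page=1' keeps the response small, 'meta.count' holds the total match count\n",
"preview = requests.get(URL + '&per-page=1').json()\n",
"print(preview['meta']['count'], 'works match the current filters')"
]
},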
{
"cell_type": "markdown",
"id": "e717661a",
"metadata": {},
"source": [
"# Query the API"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eea544aa",
"metadata": {},
"outputs": [],
"source": [
"results = []\n",
"cc = requests.get(URL).json()\n",
"\n",
"# if the number of found publications is lower than 10000, do the extraction per page\n",
"\n",
"if cc['meta']['count'] < 10000:\n",
" baseRequest = URL + '&per-page=200&page='\n",
" cc = requests.get(baseRequest+'1').json()\n",
" try:\n",
" results += cc['results']\n",
" nb_papers = cc['meta']['count']\n",
" if nb_papers > 200:\n",
" if nb_papers%200 == 0:\n",
" bound = nb_papers//200+1\n",
" else:\n",
" bound = nb_papers//200+2\n",
" for p in range(2,bound):\n",
" cc = requests.get(baseRequest+str(p)).json()\n",
" results += cc['results']\n",
" except KeyError:\n",
" print(cc)\n",
" \n",
"# if the number of found publications is higher than 10000, do the extraction with cursor\n",
"else:\n",
" baseRequest = URL + '&per-page=200&cursor='\n",
" cursor = '*'\n",
" t = 1\n",
" while t != 0:\n",
" cc = requests.get(baseRequest+cursor).json()\n",
" cursor = cc['meta']['next_cursor']\n",
" t = len(cc['results'])\n",
" results += cc['results']"
]
},
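{
"cell_type": "markdown",
"id": "aa000003",
"metadata": {},
"source": [
"The `bound` arithmetic in the page-based branch above boils down to a ceiling division: with 200 results per page, the exclusive upper limit of `range(2, bound)` equals `ceil(nb_papers / 200) + 1`. A small equivalence sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa000004",
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"# equivalence sketch for the 'bound' used above: nb_papers//200+1 (exact multiple)\n",
"# or nb_papers//200+2 (otherwise) is the same as ceil(nb_papers/200) + 1\n",
"def page_bound(nb_papers, per_page=200):\n",
"    return math.ceil(nb_papers / per_page) + 1\n",
"\n",
"assert page_bound(400) == 3  # 2 pages in total, so range(2, 3) fetches page 2\n",
"assert page_bound(401) == 4  # 3 pages in total, so range(2, 4) fetches pages 2 and 3"
]
},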
{
"cell_type": "markdown",
"id": "80cc941a",
"metadata": {},
"source": [
"# Do some cleaning stuffs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24777356",
"metadata": {},
"outputs": [],
"source": [
"# from the 'abstract_inverted_index' key per paper in the dataframe, rebuild the proper text\n",
"def rebuild_abstract(word_dict):\n",
" if word_dict != None:\n",
" # search the highest index\n",
" max_index = 0\n",
" for ind in word_dict.values():\n",
" local_max = max(ind)\n",
" if local_max > max_index:\n",
" max_index = local_max\n",
" abstract = ['']*(max_index+1)\n",
" for i in word_dict.keys():\n",
" for j in word_dict[i]:\n",
" abstract[j] = i\n",
" abstract = ' '.join(abstract).replace('\\r','').replace('\\n','').replace('\\t','')\n",
" return abstract\n",
" else:\n",
" return('')\n",
"\n",
"def process_extraction(results):\n",
" papers = pd.DataFrame(results)[['id','type','authorships','title','publication_year','primary_location',\n",
" 'referenced_works','cited_by_count','counts_by_year','concepts',\n",
" 'abstract_inverted_index']]\n",
" papers['id'] = papers['id'].apply(lambda x: x.split('W')[-1])\n",
" papers.drop_duplicates(subset='id',keep='first',inplace=True) # avoid more than one time the same work\n",
" papers['referenced_works'] = papers['referenced_works'].apply(lambda x: [r.split('W')[-1] for r in x])\n",
" papers['title'] = papers['title'].apply(lambda x: '' if x==None else x)\n",
" papers['journal_id'] = papers['primary_location'].apply(lambda x: int(x)['source']['id'].split('/S')[-1])\n",
" papers['journal_name'] = papers['primary_location'].apply(lambda x: x['source']['display_name'])\n",
" papers['abstract'] = papers['abstract_inverted_index'].apply(rebuild_abstract)\n",
" del papers['abstract_inverted_index'], papers['primary_location']\n",
" return papers"
]
},
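{
"cell_type": "markdown",
"id": "aa000005",
"metadata": {},
"source": [
"A tiny usage example of `rebuild_abstract` (illustrative words and positions, not real OpenAlex data): the abstract is stored as an inverted index mapping each word to the positions where it occurs, and the function writes every word back at its positions before joining them."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa000006",
"metadata": {},
"outputs": [],
"source": [
"# illustrative inverted index (made-up content, not real OpenAlex data)\n",
"example_index = {'graphs': [2], 'Text': [0], 'mining': [1], 'with': [3], 'GarganText': [4]}\n",
"rebuild_abstract(example_index)  # -> 'Text mining graphs with GarganText'"
]
},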
{
"cell_type": "code",
"execution_count": null,
"id": "c5877b61",
"metadata": {},
"outputs": [],
"source": [
"papers = process_extraction(results)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fbf9cd2a",
"metadata": {},
"outputs": [],
"source": [
"papers.to_csv(r'my/path/to/works.csv',sep='\\t',index=False)"
]
},
{
"cell_type": "markdown",
"id": "918d3459",
"metadata": {},
"source": [
"# Create a dedicated file of authors (respect the order given by OpenAlex)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b89a18e",
"metadata": {},
"outputs": [],
"source": [
"author = {}\n",
"chunksize = 100000\n",
"for chunk in tqdm(pd.read_csv(r'my/path/to/works.csv',sep='\\t', chunksize=chunksize)):\n",
" chunk['authorships'] = chunk['authorships'].apply(lambda x: eval(x))\n",
" chunk = chunk.explode('authorships').fillna(-1)\n",
" chunk = chunk[chunk['authorships']!=-1]\n",
" for a,y in zip(chunk['authorships'],chunk['publication_year']):\n",
" if 'id' in a['author']:\n",
" ida = int(a['author']['id'].split('/A')[-1])\n",
" if ida not in author:\n",
" author[ida] = {'name': a['author']['display_name'], \n",
" 'orcid': a['author']['orcid'],\n",
" 'affiliations':{y:a['institutions']}}\n",
" else:\n",
" if y not in author[ida]['affiliations']:\n",
" author[ida]['affiliations'][y] = a['institutions']"
]
},
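{
"cell_type": "markdown",
"id": "aa000007",
"metadata": {},
"source": [
"Design note: the stringified authorship lists read back from the CSV are parsed with `eval` above. When the file content is not fully trusted, `ast.literal_eval` is a drop-in alternative that only accepts Python literals; a minimal sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa000008",
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"\n",
"# safer drop-in for eval when parsing stringified lists/dicts read back from a CSV:\n",
"# literal_eval only evaluates Python literals (lists, dicts, strings, numbers, None)\n",
"example_cell = \"[{'author': {'display_name': 'A. Example'}, 'institutions': []}]\"  # illustrative record\n",
"ast.literal_eval(example_cell)"
]
},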
{
"cell_type": "code",
"execution_count": null,
"id": "67e52693",
"metadata": {},
"outputs": [],
"source": [
"author = pd.DataFrame.from_dict(author, orient='index').reset_index().rename(columns={'index':'id'},inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6b1df74",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"author.to_csv(r'my/path/to/authors.csv',sep='\\t',index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ccb3cc95",
"metadata": {},
"outputs": [],
"source": [
"# replace the dictionnaries of authors with a list of authors' ids\n",
"papers['authorships'] = papers['authorships'].apply(lambda x: [int(a['author']['id'].split('/A')[-1]) for a in x if 'id' in a['author']])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5614371d",
"metadata": {},
"outputs": [],
"source": [
"del papers['authorships']\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "c78016b8",
"metadata": {},
"source": [
"# Create a bibliographical citation network"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45fd1356",
"metadata": {},
"outputs": [],
"source": [
"papers['referenced_works'] = papers['referenced_works'].apply(lambda x: eval(x))\n",
"papers['lenref'] = papers['referenced_works'].apply(lambda x: len(x))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b67510bf",
"metadata": {},
"outputs": [],
"source": [
"refs = papers[papers['lenref']>0][['id','referenced_works']].explode('referenced_works')\n",
"refs.rename(columns={'id':'citing','referenced_works':'cited'}, inplace=True)\n",
"refs['cited'] = refs['cited'].astype(int)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c7ea0cd",
"metadata": {},
"outputs": [],
"source": [
"refs.to_csv(r'my/path/to/citationNet_ref.csv',sep=';')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42b52290",
"metadata": {},
"outputs": [],
"source": [
"del papers['referenced_works'], papers['lenref'], refs\n",
"gc.collect() "
]
},
{
"cell_type": "markdown",
"id": "584a0212",
"metadata": {},
"source": [
"# Search the citing papers of our subset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba7e6882",
"metadata": {},
"outputs": [],
"source": [
"# filter the ids of the papers having at least 1 citation\n",
"cited_papers = papers[papers['cited_by_count']>0]['id']\n",
"ids_papers = [list(map(str,cited_papers[k:k+50])) for k in range(0,len(cited_papers),50)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3cb8d7d4",
"metadata": {},
"outputs": [],
"source": [
"def process_extraction_citations(results):\n",
" df = pd.DataFrame(results).fillna(-1)\n",
" df['id'] = df['id'].apply(lambda x: int(x.split('W')[-1]))\n",
" df['publication_year'] = df['publication_year'].astype(int)\n",
" df['referenced_works'] = df['referenced_works'].apply(lambda x: [int(r.split('W')[-1]) for r in x])\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f3df79f",
"metadata": {},
"outputs": [],
"source": [
"# design the URL with only some selected fields in order to avoid over-request\n",
"selected_fields = 'id,publication_year,referenced_works'\n",
"citesURL = f'https://api.openalex.org/works?&mailto=sylvain.fontaine@cnrs.fr&select={selected_fields}&filter=cites:W'"
]
},
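{
"cell_type": "markdown",
"id": "aa000009",
"metadata": {},
"source": [
"For reference, a hypothetical batch of two work ids produces a request of the shape sketched below: the `cites:` filter ORs its values with `|`, which is why the extraction loop joins each batch with `'|W'`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa00000a",
"metadata": {},
"outputs": [],
"source": [
"# illustrative only: two example work ids joined the way the extraction loop below does it\n",
"example_ids = ['2741809807', '1234567890']\n",
"citesURL + '|W'.join(example_ids)\n",
"# -> '...&filter=cites:W2741809807|W1234567890'"
]
},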
{
"cell_type": "code",
"execution_count": null,
"id": "a9fced0e",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"k = 0\n",
"for ids in tqdm(ids_papers):\n",
" cc = requests.get(citesURL + '|W'.join(ids)).json()\n",
" results = []\n",
" if cc['meta']['count'] < 10000:\n",
" baseRequest = citesURL + '|W'.join(ids) + '&per-page=200&page='\n",
" cc = requests.get(baseRequest + '1').json()\n",
" try:\n",
" results += cc['results']\n",
" nb_papers = cc['meta']['count']\n",
" if nb_papers > 200:\n",
" if nb_papers%200 == 0:\n",
" bound = nb_papers//200+1\n",
" else:\n",
" bound = nb_papers//200+2\n",
" for p in range(2,bound):\n",
" cc = requests.get(baseRequest + str(p)).json()\n",
" results += cc['results']\n",
" except KeyError:\n",
" print(cc)\n",
" else:\n",
" baseRequest = citesURL + '|W'.join(ids) + '&per-page=200&cursor='\n",
" cursor = '*'\n",
" t = 1\n",
" while t != 0:\n",
" cc = requests.get(baseRequest + cursor).json()\n",
" #print(cc['meta']['count'])\n",
" cursor = cc['meta']['next_cursor']\n",
" results += cc['results']\n",
" t = len(cc['results'])\n",
"\n",
" # aggregate the results in a dataframe\n",
" if k == 0:\n",
" citing_papers = process_extraction_citations(results)\n",
" citationNet = citing_papers[['id','referenced_works','publication_year']].explode('referenced_works').rename(columns={'id':'citing','referenced_works':'cited'})\n",
" citationNet = citationNet[citationNet['cited'].isin(list(map(int,ids)))]\n",
" del citing_papers['referenced_works']\n",
" else:\n",
" cp = process_extraction_citations(results)\n",
" cn = cp[['id','referenced_works','publication_year']].explode('referenced_works').rename(columns={'id':'citing','referenced_works':'cited'})\n",
" cn = cn[cn['cited'].isin(list(map(int,ids)))]\n",
" del cp['referenced_works']\n",
" citing_papers = pd.concat([citing_papers,cp])\n",
" citationNet = pd.concat([citationNet,cn]) \n",
" k += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eaa2c5e1",
"metadata": {},
"outputs": [],
"source": [
"citationNet.to_csv(r'my/path/to/citationNet_citation.csv',sep='\\t',index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "my_env",
"language": "python",
"name": "my_env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}