gargantext / crawlers / openalex

Commit 4df691f2 authored Jun 27, 2023 by Alexandre Delanoë

[DOC]

Showing 2 changed files with 446 additions and 0 deletions: README.md (+2, -0) and docs/general_extraction.ipynb (+444, -0).

README.md (new file)
# Open Alex Database API Crawler for GarganText

docs/general_extraction.ipynb (new file)

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0997d71d",
"metadata": {},
"outputs": [],
"source": [
"# 06-26-2023\n",
"# sylvain.fontaine@cnrs.fr\n",
"# python 3.8"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4602c33",
"metadata": {},
"outputs": [],
"source": [
"import requests, re\n",
"from tqdm.auto import tqdm\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "62b3470f",
"metadata": {},
"source": [
"# Design the URL request for the API\n",
"- Set what we want to extract: Sources, Works, Authors, Concepts, Funding...\n",
"- For each of them, specific filters has to be set (see the doc for appropriate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3504e1b",
"metadata": {},
"outputs": [],
"source": [
"# example for extracting the works published in neuroscience before 2021\n",
"type_metadata = 'works'\n",
"filters = ['publication_year:<2021','concepts.id:C169760540']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4375fb97",
"metadata": {},
"outputs": [],
"source": [
"URL = f'https://api.openalex.org/{type_metadata}?&mailto=my.mail@domain.com&filter='\n",
"for f in filters:\n",
" URL += f'{f},'\n",
"URL = URL[:-1]\n",
"URL"
]
},
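{
"cell_type": "markdown",
"id": "a0b1c2d3",
"metadata": {},
"source": [
"As a quick sanity check, the cell below (a sketch added for illustration) spells out the full URL expected for the example filters above; `my.mail@domain.com` is a placeholder."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4e5f6a7",
"metadata": {},
"outputs": [],
"source": [
"# sanity check for the example above (illustrative; the mailto address is a placeholder)\n",
"expected = ('https://api.openalex.org/works?mailto=my.mail@domain.com'\n",
"            '&filter=publication_year:<2021,concepts.id:C169760540')\n",
"assert URL == expected"
]
},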
{
"cell_type": "markdown",
"id": "e717661a",
"metadata": {},
"source": [
"# Query the API"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eea544aa",
"metadata": {},
"outputs": [],
"source": [
"results = []\n",
"cc = requests.get(URL).json()\n",
"\n",
"# if the number of found publications is lower than 10000, do the extraction per page\n",
"\n",
"if cc['meta']['count'] < 10000:\n",
" baseRequest = URL + '&per-page=200&page='\n",
" cc = requests.get(baseRequest+'1').json()\n",
" try:\n",
" results += cc['results']\n",
" nb_papers = cc['meta']['count']\n",
" if nb_papers > 200:\n",
" if nb_papers%200 == 0:\n",
" bound = nb_papers//200+1\n",
" else:\n",
" bound = nb_papers//200+2\n",
" for p in range(2,bound):\n",
" cc = requests.get(baseRequest+str(p)).json()\n",
" results += cc['results']\n",
" except KeyError:\n",
" print(cc)\n",
" \n",
"# if the number of found publications is higher than 10000, do the extraction with cursor\n",
"else:\n",
" baseRequest = URL + '&per-page=200&cursor='\n",
" cursor = '*'\n",
" t = 1\n",
" while t != 0:\n",
" cc = requests.get(baseRequest+cursor).json()\n",
" cursor = cc['meta']['next_cursor']\n",
" t = len(cc['results'])\n",
" results += cc['results']"
]
},
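{
"cell_type": "markdown",
"id": "b1c2d3e4",
"metadata": {},
"source": [
"Optional: the cells above call `requests.get` directly. For long extractions, a small retry wrapper can make the loop more robust. This is a sketch added for illustration; the helper name `get_json` and its retry parameters are arbitrary, and the original workflow does not use it."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5f6a7b8",
"metadata": {},
"outputs": [],
"source": [
"# optional sketch: requests.get with basic retries (get_json is illustrative,\n",
"# not part of the original workflow)\n",
"import time\n",
"\n",
"def get_json(url, retries=3, wait=5):\n",
"    for attempt in range(retries):\n",
"        r = requests.get(url)\n",
"        if r.status_code == 200:\n",
"            return r.json()\n",
"        time.sleep(wait)  # back off before retrying, e.g. after rate limiting\n",
"    r.raise_for_status()  # give up and surface the last HTTP error"
]
},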
{
"cell_type": "markdown",
"id": "80cc941a",
"metadata": {},
"source": [
"# Do some cleaning stuffs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24777356",
"metadata": {},
"outputs": [],
"source": [
"# from the 'abstract_inverted_index' key per paper in the dataframe, rebuild the proper text\n",
"def rebuild_abstract(word_dict):\n",
" if word_dict != None:\n",
" # search the highest index\n",
" max_index = 0\n",
" for ind in word_dict.values():\n",
" local_max = max(ind)\n",
" if local_max > max_index:\n",
" max_index = local_max\n",
" abstract = ['']*(max_index+1)\n",
" for i in word_dict.keys():\n",
" for j in word_dict[i]:\n",
" abstract[j] = i\n",
" abstract = ' '.join(abstract).replace('\\r','').replace('\\n','').replace('\\t','')\n",
" return abstract\n",
" else:\n",
" return('')\n",
"\n",
"def process_extraction(results):\n",
" papers = pd.DataFrame(results)[['id','type','authorships','title','publication_year','primary_location',\n",
" 'referenced_works','cited_by_count','counts_by_year','concepts',\n",
" 'abstract_inverted_index']]\n",
" papers['id'] = papers['id'].apply(lambda x: x.split('W')[-1])\n",
" papers.drop_duplicates(subset='id',keep='first',inplace=True) # avoid more than one time the same work\n",
" papers['referenced_works'] = papers['referenced_works'].apply(lambda x: [r.split('W')[-1] for r in x])\n",
" papers['title'] = papers['title'].apply(lambda x: '' if x==None else x)\n",
" papers['journal_id'] = papers['primary_location'].apply(lambda x: int(x)['source']['id'].split('/S')[-1])\n",
" papers['journal_name'] = papers['primary_location'].apply(lambda x: x['source']['display_name'])\n",
" papers['abstract'] = papers['abstract_inverted_index'].apply(rebuild_abstract)\n",
" del papers['abstract_inverted_index'], papers['primary_location']\n",
" return papers"
]
},
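{
"cell_type": "markdown",
"id": "c2d3e4f5",
"metadata": {},
"source": [
"To see what `rebuild_abstract` does, here is a toy inverted index (made-up data): each word maps to the list of positions where it occurs in the abstract."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6a7b8c9",
"metadata": {},
"outputs": [],
"source": [
"# toy inverted index (made-up data): word -> list of positions in the abstract\n",
"toy = {'We': [0], 'study': [1, 4], 'brains': [2], 'and': [3], 'minds': [5]}\n",
"rebuild_abstract(toy)  # -> 'We study brains and study minds'"
]
},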
{
"cell_type": "code",
"execution_count": null,
"id": "c5877b61",
"metadata": {},
"outputs": [],
"source": [
"papers = process_extraction(results)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fbf9cd2a",
"metadata": {},
"outputs": [],
"source": [
"papers.to_csv(r'my/path/to/works.csv',sep='\\t',index=False)"
]
},
{
"cell_type": "markdown",
"id": "918d3459",
"metadata": {},
"source": [
"# Create a dedicated file of authors (respect the order given by OpenAlex)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b89a18e",
"metadata": {},
"outputs": [],
"source": [
"author = {}\n",
"chunksize = 100000\n",
"for chunk in tqdm(pd.read_csv(r'my/path/to/works.csv',sep='\\t', chunksize=chunksize)):\n",
" chunk['authorships'] = chunk['authorships'].apply(lambda x: eval(x))\n",
" chunk = chunk.explode('authorships').fillna(-1)\n",
" chunk = chunk[chunk['authorships']!=-1]\n",
" for a,y in zip(chunk['authorships'],chunk['publication_year']):\n",
" if 'id' in a['author']:\n",
" ida = int(a['author']['id'].split('/A')[-1])\n",
" if ida not in author:\n",
" author[ida] = {'name': a['author']['display_name'], \n",
" 'orcid': a['author']['orcid'],\n",
" 'affiliations':{y:a['institutions']}}\n",
" else:\n",
" if y not in author[ida]['affiliations']:\n",
" author[ida]['affiliations'][y] = a['institutions']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "67e52693",
"metadata": {},
"outputs": [],
"source": [
"author = pd.DataFrame.from_dict(author, orient='index').reset_index().rename(columns={'index':'id'},inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6b1df74",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"author.to_csv(r'my/path/to/authors.csv',sep='\\t',index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ccb3cc95",
"metadata": {},
"outputs": [],
"source": [
"# replace the dictionnaries of authors with a list of authors' ids\n",
"papers['authorships'] = papers['authorships'].apply(lambda x: [int(a['author']['id'].split('/A')[-1]) for a in x if 'id' in a['author']])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5614371d",
"metadata": {},
"outputs": [],
"source": [
"del papers['authorships']\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "c78016b8",
"metadata": {},
"source": [
"# Create a bibliographical citation network"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45fd1356",
"metadata": {},
"outputs": [],
"source": [
"papers['referenced_works'] = papers['referenced_works'].apply(lambda x: eval(x))\n",
"papers['lenref'] = papers['referenced_works'].apply(lambda x: len(x))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b67510bf",
"metadata": {},
"outputs": [],
"source": [
"refs = papers[papers['lenref']>0][['id','referenced_works']].explode('referenced_works')\n",
"refs.rename(columns={'id':'citing','referenced_works':'cited'}, inplace=True)\n",
"refs['cited'] = refs['cited'].astype(int)"
]
},
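{
"cell_type": "markdown",
"id": "d3e4f5a6",
"metadata": {},
"source": [
"Optional: the `refs` edge list can be loaded directly as a directed graph, assuming `networkx` is installed (this sketch is not used elsewhere in the notebook)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7b8c9d0",
"metadata": {},
"outputs": [],
"source": [
"# optional sketch, assuming networkx is installed: citing -> cited as a directed graph\n",
"import networkx as nx\n",
"\n",
"G = nx.from_pandas_edgelist(refs, source='citing', target='cited', create_using=nx.DiGraph)\n",
"print(G.number_of_nodes(), 'nodes,', G.number_of_edges(), 'edges')"
]
},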
{
"cell_type": "code",
"execution_count": null,
"id": "0c7ea0cd",
"metadata": {},
"outputs": [],
"source": [
"refs.to_csv(r'my/path/to/citationNet_ref.csv',sep=';')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42b52290",
"metadata": {},
"outputs": [],
"source": [
"del papers['referenced_works'], papers['lenref'], refs\n",
"gc.collect() "
]
},
{
"cell_type": "markdown",
"id": "584a0212",
"metadata": {},
"source": [
"# Search the citing papers of our subset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba7e6882",
"metadata": {},
"outputs": [],
"source": [
"# filter the ids of the papers having at least 1 citation\n",
"cited_papers = papers[papers['cited_by_count']>0]['id']\n",
"ids_papers = [list(map(str,cited_papers[k:k+50])) for k in range(0,len(cited_papers),50)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3cb8d7d4",
"metadata": {},
"outputs": [],
"source": [
"def process_extraction_citations(results):\n",
" df = pd.DataFrame(results).fillna(-1)\n",
" df['id'] = df['id'].apply(lambda x: int(x.split('W')[-1]))\n",
" df['publication_year'] = df['publication_year'].astype(int)\n",
" df['referenced_works'] = df['referenced_works'].apply(lambda x: [int(r.split('W')[-1]) for r in x])\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f3df79f",
"metadata": {},
"outputs": [],
"source": [
"# design the URL with only some selected fields in order to avoid over-request\n",
"selected_fields = 'id,publication_year,referenced_works'\n",
"citesURL = f'https://api.openalex.org/works?&mailto=sylvain.fontaine@cnrs.fr&select={selected_fields}&filter=cites:W'"
]
},
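{
"cell_type": "markdown",
"id": "e4f5a6b7",
"metadata": {},
"source": [
"For reference, the ids of one batch are joined with `|` (OR) right after the `cites:W` prefix; the illustrative cell below (fake ids) shows the resulting filter."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8c9d0e1",
"metadata": {},
"outputs": [],
"source": [
"# illustrative only: the cites filter for a batch of three fake ids\n",
"demo_ids = ['111', '222', '333']\n",
"print(citesURL + '|W'.join(demo_ids))\n",
"# -> ...&filter=cites:W111|W222|W333"
]
},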
{
"cell_type": "code",
"execution_count": null,
"id": "a9fced0e",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"k = 0\n",
"for ids in tqdm(ids_papers):\n",
" cc = requests.get(citesURL + '|W'.join(ids)).json()\n",
" results = []\n",
" if cc['meta']['count'] < 10000:\n",
" baseRequest = citesURL + '|W'.join(ids) + '&per-page=200&page='\n",
" cc = requests.get(baseRequest + '1').json()\n",
" try:\n",
" results += cc['results']\n",
" nb_papers = cc['meta']['count']\n",
" if nb_papers > 200:\n",
" if nb_papers%200 == 0:\n",
" bound = nb_papers//200+1\n",
" else:\n",
" bound = nb_papers//200+2\n",
" for p in range(2,bound):\n",
" cc = requests.get(baseRequest + str(p)).json()\n",
" results += cc['results']\n",
" except KeyError:\n",
" print(cc)\n",
" else:\n",
" baseRequest = citesURL + '|W'.join(ids) + '&per-page=200&cursor='\n",
" cursor = '*'\n",
" t = 1\n",
" while t != 0:\n",
" cc = requests.get(baseRequest + cursor).json()\n",
" #print(cc['meta']['count'])\n",
" cursor = cc['meta']['next_cursor']\n",
" results += cc['results']\n",
" t = len(cc['results'])\n",
"\n",
" # aggregate the results in a dataframe\n",
" if k == 0:\n",
" citing_papers = process_extraction_citations(results)\n",
" citationNet = citing_papers[['id','referenced_works','publication_year']].explode('referenced_works').rename(columns={'id':'citing','referenced_works':'cited'})\n",
" citationNet = citationNet[citationNet['cited'].isin(list(map(int,ids)))]\n",
" del citing_papers['referenced_works']\n",
" else:\n",
" cp = process_extraction_citations(results)\n",
" cn = cp[['id','referenced_works','publication_year']].explode('referenced_works').rename(columns={'id':'citing','referenced_works':'cited'})\n",
" cn = cn[cn['cited'].isin(list(map(int,ids)))]\n",
" del cp['referenced_works']\n",
" citing_papers = pd.concat([citing_papers,cp])\n",
" citationNet = pd.concat([citationNet,cn]) \n",
" k += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eaa2c5e1",
"metadata": {},
"outputs": [],
"source": [
"citationNet.to_csv(r'my/path/to/citationNet_citation.csv',sep='\\t',index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "my_env",
"language": "python",
"name": "my_env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}