Commit 4df691f2 authored by Alexandre Delanoë

[DOC]
# OpenAlex Database API Crawler for GarganText
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0997d71d",
"metadata": {},
"outputs": [],
"source": [
"# 06-26-2023\n",
"# sylvain.fontaine@cnrs.fr\n",
"# python 3.8"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4602c33",
"metadata": {},
"outputs": [],
"source": [
"import requests, re\n",
"from tqdm.auto import tqdm\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "62b3470f",
"metadata": {},
"source": [
"# Design the URL request for the API\n",
"- Set what we want to extract: Sources, Works, Authors, Concepts, Funding...\n",
"- For each of them, specific filters has to be set (see the doc for appropriate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3504e1b",
"metadata": {},
"outputs": [],
"source": [
"# example for extracting the works published in neuroscience before 2021\n",
"type_metadata = 'works'\n",
"filters = ['publication_year:<2021','concepts.id:C169760540']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4375fb97",
"metadata": {},
"outputs": [],
"source": [
"URL = f'https://api.openalex.org/{type_metadata}?&mailto=my.mail@domain.com&filter='\n",
"for f in filters:\n",
" URL += f'{f},'\n",
"URL = URL[:-1]\n",
"URL"
]
},
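{
"cell_type": "markdown",
"id": "aa000001",
"metadata": {},
"source": [
"Optional sanity check (a minimal sketch): request a single result and read `meta.count` to preview how many works match the current filters. The query step below relies on the same field to choose between page-based and cursor-based paging."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa000002",
"metadata": {},
"outputs": [],
"source": [
"# minimal sketch: preview the number of matching works before the full crawl;\n",
"# 'per-page=1' keeps the response small, 'meta.count' holds the total match count\n",
"preview = requests.get(URL + '&per-page=1').json()\n",
"print(preview['meta']['count'], 'works match the current filters')"
]
},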
{
"cell_type": "markdown",
"id": "e717661a",
"metadata": {},
"source": [
"# Query the API"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eea544aa",
"metadata": {},
"outputs": [],
"source": [
"results = []\n",
"cc = requests.get(URL).json()\n",
"\n",
"# if the number of found publications is lower than 10000, do the extraction per page\n",
"\n",
"if cc['meta']['count'] < 10000:\n",
" baseRequest = URL + '&per-page=200&page='\n",
" cc = requests.get(baseRequest+'1').json()\n",
" try:\n",
" results += cc['results']\n",
" nb_papers = cc['meta']['count']\n",
" if nb_papers > 200:\n",
" if nb_papers%200 == 0:\n",
" bound = nb_papers//200+1\n",
" else:\n",
" bound = nb_papers//200+2\n",
" for p in range(2,bound):\n",
" cc = requests.get(baseRequest+str(p)).json()\n",
" results += cc['results']\n",
" except KeyError:\n",
" print(cc)\n",
" \n",
"# if the number of found publications is higher than 10000, do the extraction with cursor\n",
"else:\n",
" baseRequest = URL + '&per-page=200&cursor='\n",
" cursor = '*'\n",
" t = 1\n",
" while t != 0:\n",
" cc = requests.get(baseRequest+cursor).json()\n",
" cursor = cc['meta']['next_cursor']\n",
" t = len(cc['results'])\n",
" results += cc['results']"
]
},
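{
"cell_type": "markdown",
"id": "aa000003",
"metadata": {},
"source": [
"The `bound` arithmetic in the page-based branch above boils down to a ceiling division: with 200 results per page, the exclusive upper limit of `range(2, bound)` equals `ceil(nb_papers / 200) + 1`. A small equivalence sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa000004",
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"# equivalence sketch for the 'bound' used above: nb_papers//200+1 (exact multiple)\n",
"# or nb_papers//200+2 (otherwise) is the same as ceil(nb_papers/200) + 1\n",
"def page_bound(nb_papers, per_page=200):\n",
"    return math.ceil(nb_papers / per_page) + 1\n",
"\n",
"assert page_bound(400) == 3  # 2 pages in total, so range(2, 3) fetches page 2\n",
"assert page_bound(401) == 4  # 3 pages in total, so range(2, 4) fetches pages 2 and 3"
]
},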
{
"cell_type": "markdown",
"id": "80cc941a",
"metadata": {},
"source": [
"# Do some cleaning stuffs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24777356",
"metadata": {},
"outputs": [],
"source": [
"# from the 'abstract_inverted_index' key per paper in the dataframe, rebuild the proper text\n",
"def rebuild_abstract(word_dict):\n",
" if word_dict != None:\n",
" # search the highest index\n",
" max_index = 0\n",
" for ind in word_dict.values():\n",
" local_max = max(ind)\n",
" if local_max > max_index:\n",
" max_index = local_max\n",
" abstract = ['']*(max_index+1)\n",
" for i in word_dict.keys():\n",
" for j in word_dict[i]:\n",
" abstract[j] = i\n",
" abstract = ' '.join(abstract).replace('\\r','').replace('\\n','').replace('\\t','')\n",
" return abstract\n",
" else:\n",
" return('')\n",
"\n",
"def process_extraction(results):\n",
" papers = pd.DataFrame(results)[['id','type','authorships','title','publication_year','primary_location',\n",
" 'referenced_works','cited_by_count','counts_by_year','concepts',\n",
" 'abstract_inverted_index']]\n",
" papers['id'] = papers['id'].apply(lambda x: x.split('W')[-1])\n",
" papers.drop_duplicates(subset='id',keep='first',inplace=True) # avoid more than one time the same work\n",
" papers['referenced_works'] = papers['referenced_works'].apply(lambda x: [r.split('W')[-1] for r in x])\n",
" papers['title'] = papers['title'].apply(lambda x: '' if x==None else x)\n",
" papers['journal_id'] = papers['primary_location'].apply(lambda x: int(x)['source']['id'].split('/S')[-1])\n",
" papers['journal_name'] = papers['primary_location'].apply(lambda x: x['source']['display_name'])\n",
" papers['abstract'] = papers['abstract_inverted_index'].apply(rebuild_abstract)\n",
" del papers['abstract_inverted_index'], papers['primary_location']\n",
" return papers"
]
},
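{
"cell_type": "markdown",
"id": "aa000005",
"metadata": {},
"source": [
"A tiny usage example of `rebuild_abstract` (illustrative words and positions, not real OpenAlex data): the abstract is stored as an inverted index mapping each word to the positions where it occurs, and the function writes every word back at its positions before joining them."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa000006",
"metadata": {},
"outputs": [],
"source": [
"# illustrative inverted index (made-up content, not real OpenAlex data)\n",
"example_index = {'graphs': [2], 'Text': [0], 'mining': [1], 'with': [3], 'GarganText': [4]}\n",
"rebuild_abstract(example_index)  # -> 'Text mining graphs with GarganText'"
]
},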
{
"cell_type": "code",
"execution_count": null,
"id": "c5877b61",
"metadata": {},
"outputs": [],
"source": [
"papers = process_extraction(results)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fbf9cd2a",
"metadata": {},
"outputs": [],
"source": [
"papers.to_csv(r'my/path/to/works.csv',sep='\\t',index=False)"
]
},
{
"cell_type": "markdown",
"id": "918d3459",
"metadata": {},
"source": [
"# Create a dedicated file of authors (respect the order given by OpenAlex)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b89a18e",
"metadata": {},
"outputs": [],
"source": [
"author = {}\n",
"chunksize = 100000\n",
"for chunk in tqdm(pd.read_csv(r'my/path/to/works.csv',sep='\\t', chunksize=chunksize)):\n",
" chunk['authorships'] = chunk['authorships'].apply(lambda x: eval(x))\n",
" chunk = chunk.explode('authorships').fillna(-1)\n",
" chunk = chunk[chunk['authorships']!=-1]\n",
" for a,y in zip(chunk['authorships'],chunk['publication_year']):\n",
" if 'id' in a['author']:\n",
" ida = int(a['author']['id'].split('/A')[-1])\n",
" if ida not in author:\n",
" author[ida] = {'name': a['author']['display_name'], \n",
" 'orcid': a['author']['orcid'],\n",
" 'affiliations':{y:a['institutions']}}\n",
" else:\n",
" if y not in author[ida]['affiliations']:\n",
" author[ida]['affiliations'][y] = a['institutions']"
]
},
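{
"cell_type": "markdown",
"id": "aa000007",
"metadata": {},
"source": [
"Design note: the stringified authorship lists read back from the CSV are parsed with `eval` above. When the file content is not fully trusted, `ast.literal_eval` is a drop-in alternative that only accepts Python literals; a minimal sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa000008",
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"\n",
"# safer drop-in for eval when parsing stringified lists/dicts read back from a CSV:\n",
"# literal_eval only evaluates Python literals (lists, dicts, strings, numbers, None)\n",
"example_cell = \"[{'author': {'display_name': 'A. Example'}, 'institutions': []}]\"  # illustrative record\n",
"ast.literal_eval(example_cell)"
]
},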
{
"cell_type": "code",
"execution_count": null,
"id": "67e52693",
"metadata": {},
"outputs": [],
"source": [
"author = pd.DataFrame.from_dict(author, orient='index').reset_index().rename(columns={'index':'id'},inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6b1df74",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"author.to_csv(r'my/path/to/authors.csv',sep='\\t',index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ccb3cc95",
"metadata": {},
"outputs": [],
"source": [
"# replace the dictionnaries of authors with a list of authors' ids\n",
"papers['authorships'] = papers['authorships'].apply(lambda x: [int(a['author']['id'].split('/A')[-1]) for a in x if 'id' in a['author']])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5614371d",
"metadata": {},
"outputs": [],
"source": [
"del papers['authorships']\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "c78016b8",
"metadata": {},
"source": [
"# Create a bibliographical citation network"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45fd1356",
"metadata": {},
"outputs": [],
"source": [
"papers['referenced_works'] = papers['referenced_works'].apply(lambda x: eval(x))\n",
"papers['lenref'] = papers['referenced_works'].apply(lambda x: len(x))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b67510bf",
"metadata": {},
"outputs": [],
"source": [
"refs = papers[papers['lenref']>0][['id','referenced_works']].explode('referenced_works')\n",
"refs.rename(columns={'id':'citing','referenced_works':'cited'}, inplace=True)\n",
"refs['cited'] = refs['cited'].astype(int)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c7ea0cd",
"metadata": {},
"outputs": [],
"source": [
"refs.to_csv(r'my/path/to/citationNet_ref.csv',sep=';')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42b52290",
"metadata": {},
"outputs": [],
"source": [
"del papers['referenced_works'], papers['lenref'], refs\n",
"gc.collect() "
]
},
{
"cell_type": "markdown",
"id": "584a0212",
"metadata": {},
"source": [
"# Search the citing papers of our subset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba7e6882",
"metadata": {},
"outputs": [],
"source": [
"# filter the ids of the papers having at least 1 citation\n",
"cited_papers = papers[papers['cited_by_count']>0]['id']\n",
"ids_papers = [list(map(str,cited_papers[k:k+50])) for k in range(0,len(cited_papers),50)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3cb8d7d4",
"metadata": {},
"outputs": [],
"source": [
"def process_extraction_citations(results):\n",
" df = pd.DataFrame(results).fillna(-1)\n",
" df['id'] = df['id'].apply(lambda x: int(x.split('W')[-1]))\n",
" df['publication_year'] = df['publication_year'].astype(int)\n",
" df['referenced_works'] = df['referenced_works'].apply(lambda x: [int(r.split('W')[-1]) for r in x])\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f3df79f",
"metadata": {},
"outputs": [],
"source": [
"# design the URL with only some selected fields in order to avoid over-request\n",
"selected_fields = 'id,publication_year,referenced_works'\n",
"citesURL = f'https://api.openalex.org/works?&mailto=sylvain.fontaine@cnrs.fr&select={selected_fields}&filter=cites:W'"
]
},
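{
"cell_type": "markdown",
"id": "aa000009",
"metadata": {},
"source": [
"For reference, a hypothetical batch of two work ids produces a request of the shape sketched below: the `cites:` filter ORs its values with `|`, which is why the extraction loop joins each batch with `'|W'`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa00000a",
"metadata": {},
"outputs": [],
"source": [
"# illustrative only: two example work ids joined the way the extraction loop below does it\n",
"example_ids = ['2741809807', '1234567890']\n",
"citesURL + '|W'.join(example_ids)\n",
"# -> '...&filter=cites:W2741809807|W1234567890'"
]
},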
{
"cell_type": "code",
"execution_count": null,
"id": "a9fced0e",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"k = 0\n",
"for ids in tqdm(ids_papers):\n",
" cc = requests.get(citesURL + '|W'.join(ids)).json()\n",
" results = []\n",
" if cc['meta']['count'] < 10000:\n",
" baseRequest = citesURL + '|W'.join(ids) + '&per-page=200&page='\n",
" cc = requests.get(baseRequest + '1').json()\n",
" try:\n",
" results += cc['results']\n",
" nb_papers = cc['meta']['count']\n",
" if nb_papers > 200:\n",
" if nb_papers%200 == 0:\n",
" bound = nb_papers//200+1\n",
" else:\n",
" bound = nb_papers//200+2\n",
" for p in range(2,bound):\n",
" cc = requests.get(baseRequest + str(p)).json()\n",
" results += cc['results']\n",
" except KeyError:\n",
" print(cc)\n",
" else:\n",
" baseRequest = citesURL + '|W'.join(ids) + '&per-page=200&cursor='\n",
" cursor = '*'\n",
" t = 1\n",
" while t != 0:\n",
" cc = requests.get(baseRequest + cursor).json()\n",
" #print(cc['meta']['count'])\n",
" cursor = cc['meta']['next_cursor']\n",
" results += cc['results']\n",
" t = len(cc['results'])\n",
"\n",
" # aggregate the results in a dataframe\n",
" if k == 0:\n",
" citing_papers = process_extraction_citations(results)\n",
" citationNet = citing_papers[['id','referenced_works','publication_year']].explode('referenced_works').rename(columns={'id':'citing','referenced_works':'cited'})\n",
" citationNet = citationNet[citationNet['cited'].isin(list(map(int,ids)))]\n",
" del citing_papers['referenced_works']\n",
" else:\n",
" cp = process_extraction_citations(results)\n",
" cn = cp[['id','referenced_works','publication_year']].explode('referenced_works').rename(columns={'id':'citing','referenced_works':'cited'})\n",
" cn = cn[cn['cited'].isin(list(map(int,ids)))]\n",
" del cp['referenced_works']\n",
" citing_papers = pd.concat([citing_papers,cp])\n",
" citationNet = pd.concat([citationNet,cn]) \n",
" k += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eaa2c5e1",
"metadata": {},
"outputs": [],
"source": [
"citationNet.to_csv(r'my/path/to/citationNet_citation.csv',sep='\\t',index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "my_env",
"language": "python",
"name": "my_env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}