Script for Bulk Downloading Documents from Sci-hub

I. Instructions

  1. Place bulk_download_script.py and doi.txt in the same directory and run the script directly. It will automatically download the article for each DOI and record the documents that could not be downloaded in the error.log file.
  2. Some downloaded .pdf files may not open properly. If so, run test_pdf_integrity.py to detect this; it produces an unopenable_pdfs.txt file listing the DOIs of the PDFs that cannot be opened. You can rename that file to doi.txt and rerun bulk_download_script.py, or search for the missing articles manually (a sketch of this retry loop follows the list).
  3. Some articles may not be available on Sci-hub; in such cases, search for them manually.
  4. If Sci-hub's URL cannot be accessed, you can edit it in bulk_download_script.py.
  5. When saving .pdf files, the script replaces "/" in the DOI with "_", which may cause issues in some cases.
  6. The format for storing DOIs in the doi.txt file is described later in this document.
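
Steps 2 above can be chained into a single retry pass. The following is a minimal sketch, not part of the original scripts; it assumes both scripts and both text files sit in the same directory with the names used in section II, and that python is on your PATH.

import os
import shutil
import subprocess

# Minimal sketch of the retry loop: reuse the DOIs of unopenable PDFs
# as the new input file and rerun the downloader on them.
if os.path.exists("unopenable_pdfs.txt") and os.path.getsize("unopenable_pdfs.txt") > 0:
    shutil.copy("unopenable_pdfs.txt", "doi.txt")
    subprocess.run(["python", "bulk_download_script.py"], check=True)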

II. Code

1. Example of doi.txt

10.1038/s41567-024-02411-5
10.1063/5.0189673
10.1021/acs.jpcc.3c05552
10.1002/anie.202315666
10.1002/ange.202308803
10.1002/anie.202308803
10.1002/ange.202315666
10.1021/acs.jpclett.3c02655
10.1021/acs.jpca.3c05841
10.1021/acs.jpclett.3c01785
10.1021/jacs.3c06099


Remove any URL prefix (such as https://doi.org/) and keep only the bare DOI. Put each DOI on its own line, and end the file with a newline and a blank line.
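
If your DOI list was copied as full links, a short helper can strip the URL prefixes first. This is a minimal sketch, not part of the original scripts; the input file name raw_dois.txt is illustrative.

# Minimal sketch: normalize a list of DOI links into the bare-DOI format above.
prefixes = ("https://doi.org/", "http://doi.org/",
            "https://dx.doi.org/", "http://dx.doi.org/")

with open("raw_dois.txt", "r", encoding="utf-8") as src, \
        open("doi.txt", "w", encoding="utf-8") as dst:
    for line in src:
        doi = line.strip()
        if not doi:
            continue
        for prefix in prefixes:
            if doi.startswith(prefix):
                doi = doi[len(prefix):]
                break
        dst.write(doi + "\n")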

2. bulk_download_script.py

import os
import threading

import requests
from bs4 import BeautifulSoup

# Save articles in the directory where the script is located
path = os.path.dirname(os.path.abspath(__file__))

# Request Headers
head = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"
}


# function of downloading papers
def download_paper(doi):
    # Build the Sci-Hub link for this DOI
    url = "https://www.sci-hub.wf/" + doi + "#"

    try:
        download_url = ""

        # Sending HTTP requests and parsing HTML pages
        r = requests.get(url, headers=head)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # Parsing to obtain the download link for the papers
        if soup.iframe is None:
            download_url = "https:" + soup.embed.attrs["src"]
        else:
            download_url = soup.iframe.attrs["src"]

        # Downloading the literature and saving it to a file
        print(doi + "\tdownloading\ndownload link:\t" + download_url)
        download_r = requests.get(download_url, headers=head)
        download_r.raise_for_status()
        with open(os.path.join(path, doi.replace("/", "_") + ".pdf"), "wb+") as temp:
            temp.write(download_r.content)

        print(doi + "\tdownloaded successfully.\n")

    # Recording error information when download fails
    except Exception as e:
        with open(os.path.join(path, "error.log"), "a+") as error:
            error.write(doi + "\tdownload failed!\n")
            if download_url.startswith("https://"):
                error.write("download link: " + download_url + "\n")
            error.write(str(e) + "\n\n")


# Opening a txt file containing DOIs
with open(os.path.join(path, "doi.txt"), "r", encoding="utf-8") as f:
    # Iterating through and reading DOI numbers, and initiating multi-threaded download of articles
    threads = []
    for line in f:
        doi = line.strip()
        if not doi:
            continue  # skip blank lines
        t = threading.Thread(target=download_paper, args=(doi,))
        threads.append(t)

    # Starting all threads
    for t in threads:
        t.start()

    # Waiting for all threads to complete
    for t in threads:
        t.join()
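
Starting one thread per DOI works for short lists, but a long doi.txt opens many connections at once. A bounded pool from the standard library's concurrent.futures throttles the downloads. The following is a sketch of a drop-in replacement for the threading loop above, reusing download_paper and path from the script; max_workers=4 is an illustrative limit, not a tested value.

from concurrent.futures import ThreadPoolExecutor

with open(os.path.join(path, "doi.txt"), "r", encoding="utf-8") as f:
    dois = [line.strip() for line in f if line.strip()]

# At most 4 downloads run concurrently; the pool waits for all to finish.
with ThreadPoolExecutor(max_workers=4) as pool:
    pool.map(download_paper, dois)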

3. test_pdf_integrity.py

import os
from PyPDF2 import PdfReader


def test_pdf_files_in_script_directory():
    # Getting the directory where the script is located
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Recording the DOIs of unopenable PDFs in a log file
    log_path = os.path.join(script_dir, "unopenable_pdfs.txt")
    with open(log_path, "a", encoding="utf-8") as log_file:
        # Iterating through all files in the directory
        for filename in os.listdir(script_dir):
            if filename.endswith(".pdf"):
                file_path = os.path.join(script_dir, filename)
                try:
                    # Try to open the pdf file
                    with open(file_path, 'rb') as file:
                        reader = PdfReader(file)
                        # Reading the first page to check if the pdf is corrupted
                        reader.pages[0]
                    print(f"{filename} opens normally.")
                except Exception as e:
                    # Recovering the DOI from the file name and logging it
                    log_file.write(f"{filename.replace('_', '/')[:-4]}\n")
                    print(f"Cannot open {filename}: {e}")


# run
if __name__ == '__main__':
    test_pdf_files_in_script_directory()
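
Before rerunning the downloader on the DOIs recorded in unopenable_pdfs.txt, you may also want to delete the corrupted files so no broken PDFs linger if a retry fails too. A minimal sketch, assuming unopenable_pdfs.txt stores bare DOIs as written by the script above:

import os

# Minimal sketch: remove the PDFs recorded as unopenable, using the same
# DOI-to-filename mapping ("/" -> "_") as bulk_download_script.py.
with open("unopenable_pdfs.txt", "r", encoding="utf-8") as f:
    for line in f:
        doi = line.strip()
        if not doi:
            continue
        pdf = doi.replace("/", "_") + ".pdf"
        if os.path.exists(pdf):
            os.remove(pdf)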