mirror of
https://github.com/zebrajr/imdbscrapper.git
synced 2026-01-15 12:15:12 +00:00
62 lines
2.2 KiB
Python
62 lines
2.2 KiB
Python
import os
|
|
import time
|
|
from time import sleep
|
|
import datetime
|
|
import json
|
|
import requests
|
|
import csv
|
|
import logging
|
|
import mysql.connector as mariadb
|
|
from multiprocessing import Process
|
|
from bs4 import BeautifulSoup
|
|
import rsc.functions as scrapper
|
|
import rsc.helper as helper
|
|
|
|
|
|
|
|
def main():
    """Scrape a descending range of numbers in parallel batches.

    Configuration is read from environment variables:

    * ``removeDuplicates`` (default 0): when 1, ``scrapper.removeDuplicates()``
      is called once before scraping starts.
    * ``PROCESSES`` (default 5): number of processes started per cycle.
    * ``START_URL`` (default 10000000): number the scan starts from.
    * ``END_URL`` (default 0): number the scan stops at.
    * ``STEPUPCYCLE`` (default 100): how many numbers are checked per cycle;
      this is divided between the processes.

    The scan walks downwards from ``START_URL`` to ``END_URL``. Each cycle
    creates up to ``PROCESSES`` processes, each handed a consecutive
    ``(start, end)`` slice for ``scrapper.imdbscrapper``, starts them all and
    waits for all of them to finish before the next cycle begins.

    Raises:
        ValueError: if ``PROCESSES`` or ``STEPUPCYCLE`` is smaller than 1, or
            if one of the numeric variables cannot be parsed as an integer.
    """
    helper.cls()

    # Manual single-range run, kept for debugging:
    #imdbscrapper(903747,903733)

    remDuplicates = int(os.getenv('removeDuplicates', 0))   # If 1: call removeDuplicates()
    nrProcesses = int(os.getenv('PROCESSES', 5))            # Number of processes to start in parallel
    startURL = int(os.getenv('START_URL', 10000000))        # Starting number
    endURL = int(os.getenv('END_URL', 0))                   # Ending number
    stepUpCycle = int(os.getenv('STEPUPCYCLE', 100))        # How many numbers are checked in each cycle

    # Reject settings that previously caused a ZeroDivisionError or a loop
    # that never advanced.
    if nrProcesses < 1 or stepUpCycle < 1:
        raise ValueError(
            "PROCESSES and STEPUPCYCLE must both be >= 1 (got %s and %s)"
            % (nrProcesses, stepUpCycle)
        )

    # Numbers handled by each process per cycle.
    # Eg: 5 processes, 100 numbers per cycle -> 20 numbers per process.
    # Clamp to 1 so the range still advances when STEPUPCYCLE < PROCESSES.
    stepUpProcess = max(1, stepUpCycle // nrProcesses)

    # Remove duplicates on boot?
    if remDuplicates == 1:
        scrapper.removeDuplicates()

    # Upper bound of the next slice to hand out; moves down towards endURL.
    currentStartURL = startURL

    # Keep going until the whole range down to endURL has been handed out.
    # Testing the start (not the end) of the next slice guarantees the final,
    # possibly shorter, slice is not skipped at a cycle boundary.
    while currentStartURL > endURL:
        # Processes belonging to the current cycle
        processes = []

        for i in range(nrProcesses):
            # Range exhausted mid-cycle: do not spawn processes without work
            if currentStartURL <= endURL:
                break

            # Never let a slice reach below endURL
            currentEndURL = max(currentStartURL - stepUpProcess, endURL)

            print("%s :: Process: %s - Starting: %s - Ending: %s" % (datetime.datetime.now(), i, currentStartURL, currentEndURL))
            processes.append(Process(target=scrapper.imdbscrapper, args=(currentStartURL, currentEndURL)))

            # The next slice begins where this one ended
            currentStartURL = currentEndURL

        for process in processes:
            process.start()

        # Wait for the whole cycle before starting the next one
        for process in processes:
            process.join()
|
|
|
|
|
|
|
|
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|