mirror of
https://github.com/zebrajr/imdbscrapper.git
synced 2026-01-15 12:15:12 +00:00
Started multithreading POC
This commit is contained in:
19
src/processes.py
Normal file
19
src/processes.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
from multiprocessing import Process
|
||||||
|
import os
|
||||||
|
import math
|
||||||
|
|
||||||
|
def calc(iterations=70000000):
    """Run a CPU-bound busy loop used as the workload for each process.

    Computes ``math.sqrt`` for every integer from 0 up to (but not
    including) *iterations* and discards the results.

    Args:
        iterations: How many consecutive integers, starting at 0, to take
            the square root of. Defaults to 70000000, the value that was
            previously hard-coded, so ``calc()`` behaves as before.

    Returns:
        None.
    """
    for i in range(iterations):
        math.sqrt(i)
|
||||||
|
|
||||||
|
# Worker processes registered by this script; filled in only when the file
# is executed directly.
processes = []

# The guard is required by multiprocessing: under the "spawn" start method
# every child re-imports this module, and without the guard each child would
# try to launch its own set of processes.
if __name__ == "__main__":
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to a single worker instead of failing in range(None).
    for i in range(os.cpu_count() or 1):
        print('registering process %d' % i)
        processes.append(Process(target=calc))

    # Start every worker first so they all run at the same time...
    for process in processes:
        process.start()

    # ...then wait for all of them to finish.
    for process in processes:
        process.join()
|
||||||
@@ -4,24 +4,32 @@ import json
|
|||||||
import requests
|
import requests
|
||||||
import csv
|
import csv
|
||||||
import logging
|
import logging
|
||||||
|
from threading import Thread
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
def cls():
    """Clear the terminal using the shell command for the current OS."""
    # os.name is 'nt' on Windows, where the command is 'cls'; everywhere
    # else 'clear' is used.
    if os.name == 'nt':
        clear_command = 'cls'
    else:
        clear_command = 'clear'
    os.system(clear_command)
|
||||||
|
|
||||||
def main():
|
def imdbscrapper(i):
|
||||||
cls()
|
cpuCount = int(os.cpu_count()) - 1
|
||||||
baseURL = "https://www.imdb.com/title/tt" # Base URL for each title
|
baseURL = "https://www.imdb.com/title/tt" # Base URL for each title
|
||||||
startURL = 9999999 # Start Number
|
startURL = 9999999 # Start Number
|
||||||
endURL = 0 # Ending Number
|
endURL = 0 # Ending Number
|
||||||
debugLevel = 40 # 20 will display Info messages, 40 errors
|
debugLevel = 40 # 20 will display Info messages, 40 errors
|
||||||
logFile = "/opt/storage/info.log" # Log output
|
logFile = "/opt/storage/info.log" # Log output
|
||||||
counterFile = "/opt/storage/counter.txt" # Which ID was last scanned
|
counterFileBase = "/opt/storage/counter" # Base for the counter file
|
||||||
reCheckFile = "/opt/storage/recheck.txt" # Which IDs to recheck
|
counterFileExt = ".txt"
|
||||||
|
#counterFile = "/opt/storage/counter.txt" # Which ID was last scanned
|
||||||
|
reCheckFile = "/opt/storage/recheck.txt" # Which IDs to recheck
|
||||||
|
|
||||||
|
|
||||||
|
startURL = int((startURL/cpuCount)*i)
|
||||||
|
endURL = int(i*(startURL/cpuCount)+1)
|
||||||
|
|
||||||
|
|
||||||
table = []
|
|
||||||
try:
|
try:
|
||||||
|
# Tries to read the value to continue from counterN.txt. On error defaults to StartURL
|
||||||
|
counterFile = counterFileBase + str(i) + counterFileExt
|
||||||
counter = open(counterFile, "r")
|
counter = open(counterFile, "r")
|
||||||
startURL = int(counter.read())
|
startURL = int(counter.read())
|
||||||
counter.close()
|
counter.close()
|
||||||
@@ -91,6 +99,23 @@ def main():
|
|||||||
counter.write(str(i))
|
counter.write(str(i))
|
||||||
counter.close()
|
counter.close()
|
||||||
|
|
||||||
|
def main():
    """Clear the screen and run one ``imdbscrapper`` worker per thread.

    The number of workers is the CPU count minus one. Each worker receives
    its index ``i`` (0 .. cpuCount - 1). All threads are started first and
    then joined, so this function returns only after every worker has
    finished.
    """
    cls()
    # One CPU is left out of the worker count. To use a fixed number of
    # workers instead, replace the expression below with a literal
    # (for example 4).
    cpuCount = int(os.cpu_count()) - 1

    threads = []
    for i in range(cpuCount):
        print(i)
        # Pass the callable and its argument separately. Writing
        # target=imdbscrapper(i) would run the scraper right here on the
        # main thread and hand its return value to Thread as the target,
        # so nothing would actually run in parallel.
        # NOTE(review): workers now run concurrently, and imdbscrapper uses
        # the same log and recheck file paths for every worker -- confirm
        # that concurrent writes to those files are acceptable.
        threads.append(Thread(target=imdbscrapper, args=(i,)))

    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()


if __name__ == "__main__":
    main()
|
||||||
|
|||||||
19
src/threads.py
Normal file
19
src/threads.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
from threading import Thread
|
||||||
|
import os
|
||||||
|
import math
|
||||||
|
|
||||||
|
def calc(iterations=4000000):
    """Run a CPU-bound busy loop used as the workload for each thread.

    Computes ``math.sqrt`` for every integer from 0 up to (but not
    including) *iterations* and discards the results.

    Args:
        iterations: How many consecutive integers, starting at 0, to take
            the square root of. Defaults to 4000000, the value that was
            previously hard-coded, so ``calc()`` behaves as before.

    Returns:
        None.
    """
    for i in range(iterations):
        math.sqrt(i)
|
||||||
|
|
||||||
|
# Worker threads registered by this script; filled in only when the file is
# executed directly.
threads = []

# Keep the workload out of import time: only launch the threads when this
# file is run as a script.
if __name__ == "__main__":
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to a single worker instead of failing in range(None).
    for i in range(os.cpu_count() or 1):
        print('registering thread %d' % i)
        threads.append(Thread(target=calc))

    # Start every worker first so they all run at the same time...
    for thread in threads:
        thread.start()

    # ...then wait for all of them to finish.
    for thread in threads:
        thread.join()
|
||||||
43789
storage/recheck.txt
43789
storage/recheck.txt
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user