mirror of
https://github.com/zebrajr/imdbscrapper.git
synced 2026-01-15 12:15:12 +00:00
Started multithreading POC
This commit is contained in:
19
src/processes.py
Normal file
19
src/processes.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from multiprocessing import Process
import os
import math


def calc(limit=70000000):
    """CPU-bound busy-work: take the square root of every int in [0, limit).

    The results are deliberately discarded; the function exists only to
    saturate one CPU core so multi-process speedup can be observed.

    Args:
        limit: exclusive upper bound of the loop (default preserves the
            original 70-million-iteration workload).
    """
    for i in range(limit):
        math.sqrt(i)


if __name__ == "__main__":
    # The __main__ guard is required for multiprocessing: under the
    # "spawn" start method (the default on Windows and macOS) each child
    # process re-imports this module, and unguarded Process(...).start()
    # calls at module level would recursively spawn children forever.
    processes = []

    # One worker process per available CPU core.
    for i in range(os.cpu_count()):
        print('registering process %d' % i)
        processes.append(Process(target=calc))

    for process in processes:
        process.start()

    # Wait for every worker so the script doesn't exit before they finish.
    for process in processes:
        process.join()
|
||||
@@ -4,24 +4,32 @@ import json
|
||||
import requests
|
||||
import csv
|
||||
import logging
|
||||
from threading import Thread
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def cls():
    """Clear the terminal screen on both Windows ('cls') and POSIX ('clear')."""
    command = 'cls' if os.name == 'nt' else 'clear'
    os.system(command)
|
||||
|
||||
def main():
|
||||
cls()
|
||||
baseURL = "https://www.imdb.com/title/tt" # Base URL for each title
|
||||
startURL = 9999999 # Start Number
|
||||
endURL = 0 # Ending Number
|
||||
debugLevel = 40 # 20 will display Info messages, 40 errors
|
||||
logFile = "/opt/storage/info.log" # Log output
|
||||
counterFile = "/opt/storage/counter.txt" # Which ID was last scanned
|
||||
reCheckFile = "/opt/storage/recheck.txt" # Which IDs to recheck
|
||||
def imdbscrapper(i):
|
||||
cpuCount = int(os.cpu_count()) - 1
|
||||
baseURL = "https://www.imdb.com/title/tt" # Base URL for each title
|
||||
startURL = 9999999 # Start Number
|
||||
endURL = 0 # Ending Number
|
||||
debugLevel = 40 # 20 will display Info messages, 40 errors
|
||||
logFile = "/opt/storage/info.log" # Log output
|
||||
counterFileBase = "/opt/storage/counter" # Base for the counter file
|
||||
counterFileExt = ".txt"
|
||||
#counterFile = "/opt/storage/counter.txt" # Which ID was last scanned
|
||||
reCheckFile = "/opt/storage/recheck.txt" # Which IDs to recheck
|
||||
|
||||
|
||||
startURL = int((startURL/cpuCount)*i)
|
||||
endURL = int(i*(startURL/cpuCount)+1)
|
||||
|
||||
|
||||
table = []
|
||||
try:
|
||||
# Tries to read the value to continue from counterN.txt. On error defaults to StartURL
|
||||
counterFile = counterFileBase + str(i) + counterFileExt
|
||||
counter = open(counterFile, "r")
|
||||
startURL = int(counter.read())
|
||||
counter.close()
|
||||
@@ -91,6 +99,23 @@ def main():
|
||||
counter.write(str(i))
|
||||
counter.close()
|
||||
|
||||
def main():
    """Entry point: fan the scraper out across worker threads.

    Spawns one thread per CPU core (minus one, to leave a core free for
    the rest of the system), each running ``imdbscrapper(i)`` over its
    own slice of the ID range, then waits for all of them to finish.
    """
    cls()
    # Comment in/out if you want 90% or full performance (not implemented)
    cpuCount = int(os.cpu_count()) - 1
    #cpuCount = 4

    threads = []
    for i in range(cpuCount):
        print(i)
        # BUG FIX: the original `Thread(target=imdbscrapper(i))` CALLED the
        # scraper immediately on the main thread and handed its return value
        # (None) to Thread as the target, so nothing ever ran concurrently.
        # Pass the callable itself plus its argument instead.
        threads.append(Thread(target=imdbscrapper, args=(i,)))

    for thread in threads:
        thread.start()

    # Block until every worker has finished its slice.
    for thread in threads:
        thread.join()
|
||||
|
||||
|
||||
# Script entry point: run the scraper only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
||||
|
||||
19
src/threads.py
Normal file
19
src/threads.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from threading import Thread
import os
import math


def calc(limit=4000000):
    """CPU-bound busy-work: take the square root of every int in [0, limit).

    The results are deliberately discarded; the function exists only to
    keep a thread busy so the threading POC has work to schedule.

    Args:
        limit: exclusive upper bound of the loop (default preserves the
            original 4-million-iteration workload).
    """
    for i in range(limit):
        math.sqrt(i)


def main():
    """Spawn one calc thread per CPU core and wait for all of them.

    NOTE: because of the GIL this pure-Python workload will not actually
    run in parallel — that is the point of this POC comparison against
    the multiprocessing variant.
    """
    threads = []

    for i in range(os.cpu_count()):
        print('registering thread %d' % i)
        threads.append(Thread(target=calc))

    for thread in threads:
        thread.start()

    # Wait for every worker so the script doesn't exit before they finish.
    for thread in threads:
        thread.join()


if __name__ == "__main__":
    # Guard the benchmark so merely importing this module (e.g. from a
    # test or another script) does not kick off the thread workload.
    main()
|
||||
43789
storage/recheck.txt
43789
storage/recheck.txt
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user