Started multithreading POC

This commit is contained in:
Carlos Sousa
2021-05-03 02:06:12 +02:00
parent b1984e1fdf
commit f5cb768a65
4 changed files with 94 additions and 43778 deletions

src/processes.py (new file, 19 additions)

@@ -0,0 +1,19 @@
from multiprocessing import Process
import os
import math


def calc():
    # Pure CPU-bound work: no I/O, just square roots
    for i in range(0, 70000000):
        math.sqrt(i)


# One worker process per CPU core
# (note: spawn-based platforms such as Windows would need an `if __name__ == "__main__":` guard here)
processes = []
for i in range(os.cpu_count()):
    print('registering process %d' % i)
    processes.append(Process(target=calc))

for process in processes:
    process.start()

for process in processes:
    process.join()
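
processes.py is the process half of the POC: one worker process per core, each grinding through a pure CPU-bound loop, which is exactly the case where multiprocessing sidesteps the GIL. For comparison, a minimal sketch of the same workload using the standard-library concurrent.futures.ProcessPoolExecutor (my assumption, not part of this commit), with a rough timing around it:

import concurrent.futures
import math
import os
import time


def calc():
    # Same CPU-bound workload as processes.py
    for i in range(0, 70000000):
        math.sqrt(i)


if __name__ == "__main__":
    start = time.perf_counter()
    # One calc() job per core; the pool manages worker startup and teardown
    with concurrent.futures.ProcessPoolExecutor(max_workers=os.cpu_count()) as pool:
        concurrent.futures.wait([pool.submit(calc) for _ in range(os.cpu_count())])
    print("elapsed: %.2fs" % (time.perf_counter() - start))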

@@ -4,24 +4,32 @@ import json
 import requests
 import csv
 import logging
+from threading import Thread
 from bs4 import BeautifulSoup

 def cls():
     os.system('cls' if os.name=='nt' else 'clear')

-def main():
-    cls()
+def imdbscrapper(i):
+    cpuCount = int(os.cpu_count()) - 1
     baseURL = "https://www.imdb.com/title/tt"   # Base URL for each title
     startURL = 9999999                          # Start Number
     endURL = 0                                  # Ending Number
     debugLevel = 40                             # 20 will display Info messages, 40 errors
     logFile = "/opt/storage/info.log"           # Log output
-    counterFile = "/opt/storage/counter.txt"    # Which ID was last scanned
-    reCheckFile = "/opt/storage/recheck.txt"    # Which IDs to recheck
+    counterFileBase = "/opt/storage/counter"    # Base for the counter file
+    counterFileExt = ".txt"
+    #counterFile = "/opt/storage/counter.txt"   # Which ID was last scanned
+    reCheckFile = "/opt/storage/recheck.txt"    # Which IDs to recheck
+    startURL = int((startURL/cpuCount)*i)
+    endURL = int(i*(startURL/cpuCount)+1)
+    table = []
     try:
+        # Tries to read the value to continue from counterN.txt. On error defaults to StartURL
+        counterFile = counterFileBase + str(i) + counterFileExt
         counter = open(counterFile, "r")
         startURL = int(counter.read())
         counter.close()
@@ -91,6 +99,23 @@ def main():
         counter.write(str(i))
         counter.close()

+def main():
+    cls()
+    # Comment in/out if you want 90% or full performance (not implemented)
+    cpuCount = int(os.cpu_count()) - 1
+    #cpuCount = 4
+    threads = []
+    for i in range(cpuCount):
+        print(i)
+        threads.append(Thread(target=imdbscrapper(i)))
+    for thread in threads:
+        thread.start()
+    for thread in threads:
+        thread.join()
+
 if __name__ == "__main__":
     main()
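
Two things in the new main() are worth flagging. Thread(target=imdbscrapper(i)) calls imdbscrapper(i) immediately in the main thread and hands its return value (None) to Thread, so the workers end up running one after another rather than in parallel; the callable and its argument should be passed separately. And because startURL is reassigned before endURL is derived from it, the per-worker ranges do not partition the ID space as intended. A minimal sketch of the intended pattern, assuming the goal is to split IDs 0 to 9999999 evenly across the workers (the worker signature and the range math here are my assumptions, not part of the commit):

from threading import Thread
import os


def imdbscrapper(worker_id, start_id, end_id):
    # Hypothetical worker: scan IDs downward from start_id to end_id
    print("worker %d: %d -> %d" % (worker_id, start_id, end_id))


def main():
    cpu_count = int(os.cpu_count()) - 1
    total = 9999999
    chunk = total // cpu_count
    threads = []
    for i in range(cpu_count):
        start_id = total - i * chunk
        end_id = 0 if i == cpu_count - 1 else total - (i + 1) * chunk
        # Pass the callable plus its arguments; do not call it here
        threads.append(Thread(target=imdbscrapper, args=(i, start_id, end_id)))
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()


if __name__ == "__main__":
    main()

Threads are a reasonable fit for the scraper itself, since the per-ID work is dominated by network I/O in requests rather than CPU; the processes.py POC above and threads.py below cover the opposite, purely CPU-bound, case.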

src/threads.py (new file, 19 additions)

@@ -0,0 +1,19 @@
from threading import Thread
import os
import math


def calc():
    # Same CPU-bound loop as processes.py, but smaller:
    # the GIL keeps these threads from running the math in parallel
    for i in range(0, 4000000):
        math.sqrt(i)


# One thread per CPU core
threads = []
for i in range(os.cpu_count()):
    print('registering thread %d' % i)
    threads.append(Thread(target=calc))

for thread in threads:
    thread.start()

for thread in threads:
    thread.join()
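
threads.py mirrors processes.py with threading.Thread instead of Process, which is the point of the comparison: the GIL prevents these pure-Python loops from executing concurrently, so the threaded version barely benefits from extra cores while the process version does. A small timing harness (my sketch, not part of the commit) that runs the same workload both ways makes the difference visible:

import math
import os
import time
from multiprocessing import Process
from threading import Thread


def calc(n=4000000):
    # Identical CPU-bound workload for both runs
    for i in range(0, n):
        math.sqrt(i)


def run(worker_cls):
    # Spawn one worker of the given type per core and time start-to-join
    workers = [worker_cls(target=calc) for _ in range(os.cpu_count())]
    start = time.perf_counter()
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    return time.perf_counter() - start


if __name__ == "__main__":
    print("threads:   %.2fs" % run(Thread))
    print("processes: %.2fs" % run(Process))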

File diff suppressed because it is too large.