Started multithreading POC

2026-01-15 12:15:12 +00:00 · 2021-05-03 02:06:12 +02:00
parent b1984e1fdf
commit f5cb768a65
4 changed files with 94 additions and 43778 deletions
--- a/src/processes.py
+++ b/src/processes.py
@@ -0,0 +1,19 @@
 from multiprocessing import Process
 import os
 import math
 def calc(iterations=70000000):
 	"""Burn CPU time by computing the square root of every integer in a range.
 	The results are discarded; the function exists only to generate load.
 	Args:
 		iterations: Number of consecutive integers, starting at 0, to process.
 			Defaults to the workload size used by the process benchmark.
 	"""
 	for i in range(0, iterations):
 		math.sqrt(i)
 # Guard the entry point: with the "spawn" start method every child process
 # re-imports this module, and unguarded top-level code would try to start
 # new processes again from inside each child.
 if __name__ == '__main__':
 	processes = []
 	# Register one worker process per CPU reported by the OS.
 	for i in range(os.cpu_count()):
 		print('registering process %d' % i)
 		processes.append(Process(target=calc))
 	# Start every worker before waiting on any, so they run concurrently.
 	for process in processes:
 		process.start()
 	for process in processes:
 		process.join()
--- a/src/scrapper.py
+++ b/src/scrapper.py
@@ -4,24 +4,32 @@ import json
 import requests
 import csv
 import logging
 from threading import Thread
 from bs4 import BeautifulSoup
 def cls():
    """Clear the terminal using the shell command that matches the current OS."""
    # os.name is 'nt' on Windows, where the command is 'cls'; otherwise use 'clear'.
    if os.name == 'nt':
        command = 'cls'
    else:
        command = 'clear'
    os.system(command)
-def main():
+def imdbscrapper(i):
-    cls()
+    cpuCount        = int(os.cpu_count()) - 1
-    baseURL     = "https://www.imdb.com/title/tt"       # Base URL for each title
+    baseURL         = "https://www.imdb.com/title/tt"       # Base URL for each title
-    startURL    = 9999999                               # Start Number
+    startURL        = 9999999                               # Start Number
-    endURL      = 0                                     # Ending Number
+    endURL          = 0                                     # Ending Number
-    debugLevel  = 40                                    # 20 will display Info messages, 40 errors
+    debugLevel      = 40                                    # 20 will display Info messages, 40 errors
-    logFile     = "/opt/storage/info.log"               # Log output
+    logFile         = "/opt/storage/info.log"               # Log output
-    counterFile = "/opt/storage/counter.txt"            # Which ID was last scanned
+    counterFileBase = "/opt/storage/counter"            # Base for the counter file
-    reCheckFile = "/opt/storage/recheck.txt"            # Which IDs to recheck
+    counterFileExt  = ".txt"
    #counterFile     = "/opt/storage/counter.txt"            # Which ID was last scanned
    reCheckFile     = "/opt/storage/recheck.txt"            # Which IDs to recheck
    startURL = int((startURL/cpuCount)*i)
    endURL = int(i*(startURL/cpuCount)+1)
    table = []
    try:
        # Tries to read the value to continue from counterN.txt. On error defaults to StartURL
        counterFile = counterFileBase + str(i) + counterFileExt
        counter = open(counterFile, "r")
        startURL = int(counter.read())
        counter.close()
@@ -91,6 +99,23 @@ def main():
            counter.write(str(i))
            counter.close()
 def main():
    """Clear the screen, then run one imdbscrapper thread per worker slot.
    The number of workers is one fewer than the CPU count reported by the OS.
    Each thread receives its zero-based index as the sole argument, and this
    function returns only after every thread has finished.
    """
    cls()
    # Comment in/out if you want 90% or full performance (not implemented)
    cpuCount        = int(os.cpu_count()) - 1
    #cpuCount        = 4
    threads = []
    for i in range(cpuCount):
        print(i)
        # Hand Thread the callable and its argument separately. Writing
        # target=imdbscrapper(i) would run the scraper right here, in the
        # main thread, and give the Thread its return value as the target.
        threads.append(Thread(target=imdbscrapper, args=(i,)))
    # Start all threads before joining any so they overlap.
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
 # Run the scraper only when this file is executed directly, not when imported.
 if __name__ == "__main__":
    main()
--- a/src/threads.py
+++ b/src/threads.py
@@ -0,0 +1,19 @@
 from threading import Thread
 import os
 import math
 def calc(iterations=4000000):
 	"""Burn CPU time by computing the square root of every integer in a range.
 	The results are discarded; the function exists only to generate load.
 	Args:
 		iterations: Number of consecutive integers, starting at 0, to process.
 			Defaults to the workload size used by the thread benchmark.
 	"""
 	for i in range(0, iterations):
 		math.sqrt(i)
 # Register one worker thread per CPU reported by the OS, announcing each one.
 threads = []
 for index in range(os.cpu_count()):
 	print('registering thread %d' % index)
 	worker = Thread(target=calc)
 	threads.append(worker)
 # Start every worker before waiting on any of them.
 for worker in threads:
 	worker.start()
 # Block until all workers have finished.
 for worker in threads:
 	worker.join()
--- a/storage/recheck.txt
+++ b/storage/recheck.txt