mirror of
https://github.com/zebrajr/imdbscrapper.git
synced 2026-01-15 12:15:12 +00:00
Started multithreading POC
This commit is contained in:
19
src/processes.py
Normal file
19
src/processes.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from multiprocessing import Process
import os
import math


def calc(limit=70000000):
    """CPU-bound busy-work: take the square root of every int in [0, limit).

    The results are deliberately discarded; the function exists only to
    saturate one CPU core so multi-process speedup can be observed.

    Args:
        limit: exclusive upper bound of the loop (default preserves the
            original 70-million-iteration workload).
    """
    for i in range(limit):
        math.sqrt(i)


if __name__ == "__main__":
    # The __main__ guard is required for multiprocessing: under the
    # "spawn" start method (the default on Windows and macOS) each child
    # process re-imports this module, and unguarded Process(...).start()
    # calls at module level would recursively spawn children forever.
    processes = []

    # One worker process per available CPU core.
    for i in range(os.cpu_count()):
        print('registering process %d' % i)
        processes.append(Process(target=calc))

    for process in processes:
        process.start()

    # Wait for every worker so the script doesn't exit before they finish.
    for process in processes:
        process.join()
|
||||
@@ -4,24 +4,32 @@ import json
|
||||
import requests
|
||||
import csv
|
||||
import logging
|
||||
from threading import Thread
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def cls():
    """Clear the terminal screen on both Windows ('cls') and POSIX ('clear')."""
    command = 'cls' if os.name == 'nt' else 'clear'
    os.system(command)
|
||||
|
||||
def main():
|
||||
cls()
|
||||
baseURL = "https://www.imdb.com/title/tt" # Base URL for each title
|
||||
startURL = 9999999 # Start Number
|
||||
endURL = 0 # Ending Number
|
||||
debugLevel = 40 # 20 will display Info messages, 40 errors
|
||||
logFile = "/opt/storage/info.log" # Log output
|
||||
counterFile = "/opt/storage/counter.txt" # Which ID was last scanned
|
||||
reCheckFile = "/opt/storage/recheck.txt" # Which IDs to recheck
|
||||
def imdbscrapper(i):
|
||||
cpuCount = int(os.cpu_count()) - 1
|
||||
baseURL = "https://www.imdb.com/title/tt" # Base URL for each title
|
||||
startURL = 9999999 # Start Number
|
||||
endURL = 0 # Ending Number
|
||||
debugLevel = 40 # 20 will display Info messages, 40 errors
|
||||
logFile = "/opt/storage/info.log" # Log output
|
||||
counterFileBase = "/opt/storage/counter" # Base for the counter file
|
||||
counterFileExt = ".txt"
|
||||
#counterFile = "/opt/storage/counter.txt" # Which ID was last scanned
|
||||
reCheckFile = "/opt/storage/recheck.txt" # Which IDs to recheck
|
||||
|
||||
|
||||
startURL = int((startURL/cpuCount)*i)
|
||||
endURL = int(i*(startURL/cpuCount)+1)
|
||||
|
||||
|
||||
table = []
|
||||
try:
|
||||
# Tries to read the value to continue from counterN.txt. On error defaults to StartURL
|
||||
counterFile = counterFileBase + str(i) + counterFileExt
|
||||
counter = open(counterFile, "r")
|
||||
startURL = int(counter.read())
|
||||
counter.close()
|
||||
@@ -91,6 +99,23 @@ def main():
|
||||
counter.write(str(i))
|
||||
counter.close()
|
||||
|
||||
def main():
    """Entry point: fan the scraper out across worker threads.

    Spawns one thread per CPU core (minus one, to leave a core free for
    the rest of the system), each running ``imdbscrapper(i)`` over its
    own slice of the ID range, then waits for all of them to finish.
    """
    cls()
    # Comment in/out if you want 90% or full performance (not implemented)
    cpuCount = int(os.cpu_count()) - 1
    #cpuCount = 4

    threads = []
    for i in range(cpuCount):
        print(i)
        # BUG FIX: the original `Thread(target=imdbscrapper(i))` CALLED the
        # scraper immediately on the main thread and handed its return value
        # (None) to Thread as the target, so nothing ever ran concurrently.
        # Pass the callable itself plus its argument instead.
        threads.append(Thread(target=imdbscrapper, args=(i,)))

    for thread in threads:
        thread.start()

    # Block until every worker has finished its slice.
    for thread in threads:
        thread.join()
|
||||
|
||||
|
||||
# Script entry point: run the scraper only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
||||
|
||||
19
src/threads.py
Normal file
19
src/threads.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from threading import Thread
import os
import math


def calc(limit=4000000):
    """CPU-bound busy-work: take the square root of every int in [0, limit).

    The results are deliberately discarded; the function exists only to
    keep a thread busy so the threading POC has work to schedule.

    Args:
        limit: exclusive upper bound of the loop (default preserves the
            original 4-million-iteration workload).
    """
    for i in range(limit):
        math.sqrt(i)


def main():
    """Spawn one calc thread per CPU core and wait for all of them.

    NOTE: because of the GIL this pure-Python workload will not actually
    run in parallel — that is the point of this POC comparison against
    the multiprocessing variant.
    """
    threads = []

    for i in range(os.cpu_count()):
        print('registering thread %d' % i)
        threads.append(Thread(target=calc))

    for thread in threads:
        thread.start()

    # Wait for every worker so the script doesn't exit before they finish.
    for thread in threads:
        thread.join()


if __name__ == "__main__":
    # Guard the benchmark so merely importing this module (e.g. from a
    # test or another script) does not kick off the thread workload.
    main()
|
||||
43789
storage/recheck.txt
43789
storage/recheck.txt
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user