Calculating file MD5 in Python with multiple threads

I'm a Python novice. I wrote a program that calculates file MD5 hashes and it runs fine, but I got confused when I tried to convert it to multithreading: on a two-core machine, the two-thread version takes the same time as the single-threaded one, and I can't figure out what the problem is. The test set has 44 files, each 200 MB to 400 MB, 12.1 GB in total, and both versions run for about 42 seconds.
I then tried multiple processes as well, but the processing time did not improve either.


Single-threaded code:

#!/usr/bin/python3
import os, hashlib, datetime

def listFiles(dir):
    paths = []
    for root,dirs,files in os.walk(dir):
        for file in files:
            paths.append(os.path.join(root,file))
            
    return paths
    
def calcMD5(filePath, block_size=2**20):
    md5 = hashlib.md5()
    f = open(filePath, "rb")
    while True:
        data = f.read(block_size)
        if not data:
            break
        md5.update(data)
    f.close()
    return md5.hexdigest()
        
files = listFiles("/data/S01")

result = []

startTime = datetime.datetime.now()

for i in files:
    fileMD5 = calcMD5(i)
    result.append(fileMD5)

print(result)

endTime = datetime.datetime.now()
timeDiff = endTime - startTime
timeDiffSeconds = timeDiff.seconds
print("{0}{1}".format(int(timeDiffSeconds/60), int(timeDiffSeconds%60)))

Multithreaded code:

#!/usr/bin/python3
import os, hashlib, datetime, threading, queue

def listFiles(dir):
    paths = []
    for root, dirs, files in os.walk(dir):
        for file in files:
            paths.append(os.path.join(root, file))

    return paths


class threadMD5(threading.Thread):
    def __init__(self, q, result):
        threading.Thread.__init__(self)
        self.q = q
        self.result = result

    def run(self):
        while True:
            try:
                filePath = self.q.get(block=False)
            except queue.Empty:
                print("thread end")
                break
            fileMD5 = calcMD5(filePath)
            # collect the digest; list.append is thread-safe under the GIL
            self.result.append(fileMD5)
            self.q.task_done()

def calcMD5(filePath, block_size=2**20):
    md5 = hashlib.md5()
    f = open(filePath, "rb")
    while True:
        data = f.read(block_size)
        if not data:
            break
        md5.update(data)
    f.close()
    return md5.hexdigest()

startTime = datetime.datetime.now()
files = listFiles("/data/S01")

result = []

# fill the work queue with all file paths
q = queue.Queue()
for i in files:
    q.put(i, block=False)

threads = []

for i in range(2):
    t = threadMD5(q, result)
    t.daemon = True
    t.start()
    threads.append(t)

for i in threads:
    i.join()

print(result)

endTime = datetime.datetime.now()
timeDiff = endTime - startTime
timeDiffSeconds = timeDiff.seconds
print("{0}m {1}s".format(timeDiffSeconds // 60, timeDiffSeconds % 60))

Multiprocessing code:

import os, hashlib, time, datetime
import multiprocessing as mp

results = []

def listFiles(dir):
    paths = []
    for root,dirs,files in os.walk(dir):
        for file in files:
            paths.append(os.path.join(root,file))
            
    return paths

def calcMD5(filePath, block_size=2**20):
    md5 = hashlib.md5()
    f = open(filePath, "rb")
    while True:
        data = f.read(block_size)
        if not data:
            break
        md5.update(data)
    f.close()
    return md5.hexdigest()

def collect_results(result):
    # calcMD5 returns a hex string; append it rather than extending character by character
    results.append(result)

if __name__ == "__main__":
    p = mp.Pool(processes=2)
    files = listFiles("/data/S01")
    startTime = datetime.datetime.now()
    for f in files:
        p.apply_async(calcMD5, args=(f, ), callback=collect_results)
    p.close()
    p.join()
    print(results)
    
    endTime = datetime.datetime.now()
    timeDiff = endTime - startTime
    timeDiffSeconds = timeDiff.seconds
    print("{0}{1}".format(int(timeDiffSeconds/60), int(timeDiffSeconds%60)))

Mar.02,2021

It seems I was misled by Hyper-V. After switching to a physical machine and to a VMware virtual machine, the timings behave normally.


Search Baidu for "Python GIL": in CPython, the Global Interpreter Lock keeps two threads from executing Python bytecode at the same time, so CPU-bound work usually does not get faster just by adding threads.
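
To see whether the limit here is really the GIL or simply disk I/O, one option is to time the same workload with a thread pool and with a process pool. Below is a minimal sketch using concurrent.futures; the timed_run helper and the two-worker setting are illustrative assumptions, and listFiles/calcMD5 are the same helpers as in the question.

#!/usr/bin/python3
# Sketch: time the same MD5 workload with a thread pool and a process pool.
import os, hashlib, time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def listFiles(dir):
    paths = []
    for root, dirs, files in os.walk(dir):
        for file in files:
            paths.append(os.path.join(root, file))
    return paths

def calcMD5(filePath, block_size=2**20):
    md5 = hashlib.md5()
    with open(filePath, "rb") as f:
        for data in iter(lambda: f.read(block_size), b""):
            md5.update(data)
    return md5.hexdigest()

def timed_run(executor_cls, paths, workers=2):
    start = time.time()
    with executor_cls(max_workers=workers) as ex:
        digests = list(ex.map(calcMD5, paths))
    return time.time() - start, digests

if __name__ == "__main__":
    paths = listFiles("/data/S01")
    t_threads, _ = timed_run(ThreadPoolExecutor, paths)
    t_procs, _ = timed_run(ProcessPoolExecutor, paths)
    # If both stay near the single-threaded 42s, disk I/O is the likely
    # bottleneck; if only the process pool is faster, the GIL is the limit.
    print("threads: {0:.1f}s  processes: {1:.1f}s".format(t_threads, t_procs))

Note that the OS page cache can skew back-to-back runs over the same 12.1 GB of files, so each measurement is best repeated on a cold cache before drawing conclusions.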
