浏览代码

Fix performance issue for huge repositories

Problem: gitstats reads every commit and every file in the repository in one
thread during initial statistics generation (i.e. when no cache is available).
This can take a long time for huge repositories (100 000+ files). Solution: run
all read commands in a pool of 24 worker processes instead of one thread.

Signed-off-by: Heikki Hokkanen <hoxu@users.sf.net>
Andrey Devyatkin 13 年前
父节点
当前提交
8647c75d48
共有 1 个文件被更改,包括 64 次插入31 次删除
  1. 64
    31
      gitstats

+ 64
- 31
gitstats 查看文件

14
 import time
14
 import time
15
 import zlib
15
 import zlib
16
 
16
 
17
+if sys.version_info < (2, 6):
18
+       print >> sys.stderr, "Python 2.6 or higher is required for gitstats"
19
+       sys.exit(1)
20
+
21
+from multiprocessing import Pool
22
+
17
 os.environ['LC_ALL'] = 'C'
23
 os.environ['LC_ALL'] = 'C'
18
 
24
 
19
 GNUPLOT_COMMON = 'set terminal png transparent size 640,240\nset size 1.0,1.0\n'
25
 GNUPLOT_COMMON = 'set terminal png transparent size 640,240\nset size 1.0,1.0\n'
104
 def getgnuplotversion():
110
 def getgnuplotversion():
105
 	return getpipeoutput(['%s --version' % gnuplot_cmd]).split('\n')[0]
111
 	return getpipeoutput(['%s --version' % gnuplot_cmd]).split('\n')[0]
106
 
112
 
113
+def getnumoffilesfromrev(time_rev):
114
+	"""
115
+	Get number of files in the tree of the given revision
116
+	"""
117
+	time, rev = time_rev
118
+	return (int(time), rev, int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0]))
119
+
120
+def getnumoflinesinblob(ext_blob):
121
+	"""
122
+	Get number of lines in blob
123
+	"""
124
+	ext, blob_id = ext_blob
125
+	return (ext, blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0]))
126
+
107
 class DataCollector:
127
 class DataCollector:
108
 	"""Manages data collection from a revision control repository."""
128
 	"""Manages data collection from a revision control repository."""
109
 	def __init__(self):
129
 	def __init__(self):
408
 			# timezone
428
 			# timezone
409
 			self.commits_by_timezone[timezone] = self.commits_by_timezone.get(timezone, 0) + 1
429
 			self.commits_by_timezone[timezone] = self.commits_by_timezone.get(timezone, 0) + 1
410
 
430
 
411
-		# TODO Optimize this, it's the worst bottleneck
412
 		# outputs "<stamp> <files>" for each revision
431
 		# outputs "<stamp> <files>" for each revision
413
 		revlines = getpipeoutput(['git rev-list --pretty=format:"%%at %%T" %s' % getcommitrange('HEAD'), 'grep -v ^commit']).strip().split('\n')
432
 		revlines = getpipeoutput(['git rev-list --pretty=format:"%%at %%T" %s' % getcommitrange('HEAD'), 'grep -v ^commit']).strip().split('\n')
414
 		lines = []
433
 		lines = []
434
+		revs_to_read = []
435
+		time_rev_count = []
436
+		#Look up rev in cache and take info from cache if found
437
+		#If not found, append rev to the list of revs to read from the repo
415
 		for revline in revlines:
438
 		for revline in revlines:
416
 			time, rev = revline.split(' ')
439
 			time, rev = revline.split(' ')
417
-			linecount = self.getFilesInCommit(rev)
418
-			lines.append('%d %d' % (int(time), linecount))
440
+			#if cache is empty then add time and rev to the list of new revs
441
+			#otherwise try to read needed info from cache
442
+			if 'files_in_tree' not in self.cache.keys():
443
+				revs_to_read.append((time,rev))
444
+				continue
445
+			if rev in self.cache['files_in_tree'].keys():
446
+				lines.append('%d %d' % (int(time), self.cache['files_in_tree'][rev]))
447
+			else:
448
+				revs_to_read.append((time,rev))
449
+
450
+		#Read revisions from repo
451
+		time_rev_count = Pool(processes=24).map(getnumoffilesfromrev, revs_to_read)
452
+
453
+		#Update cache with new revisions and append them to the general list
454
+		for (time, rev, count) in time_rev_count:
455
+			if 'files_in_tree' not in self.cache:
456
+				self.cache['files_in_tree'] = {}
457
+			self.cache['files_in_tree'][rev] = count
458
+			lines.append('%d %d' % (int(time), count))
419
 
459
 
420
 		self.total_commits += len(lines)
460
 		self.total_commits += len(lines)
421
 		for line in lines:
461
 		for line in lines:
430
 
470
 
431
 		# extensions and size of files
471
 		# extensions and size of files
432
 		lines = getpipeoutput(['git ls-tree -r -l -z %s' % getcommitrange('HEAD', end_only = True)]).split('\000')
472
 		lines = getpipeoutput(['git ls-tree -r -l -z %s' % getcommitrange('HEAD', end_only = True)]).split('\000')
473
+		blobs_to_read = []
433
 		for line in lines:
474
 		for line in lines:
434
 			if len(line) == 0:
475
 			if len(line) == 0:
435
 				continue
476
 				continue
437
 			if parts[0] == '160000' and parts[3] == '-':
478
 			if parts[0] == '160000' and parts[3] == '-':
438
 				# skip submodules
479
 				# skip submodules
439
 				continue
480
 				continue
440
-			sha1 = parts[2]
481
+			blob_id = parts[2]
441
 			size = int(parts[3])
482
 			size = int(parts[3])
442
 			fullpath = parts[4]
483
 			fullpath = parts[4]
443
 
484
 
451
 				ext = filename[(filename.rfind('.') + 1):]
492
 				ext = filename[(filename.rfind('.') + 1):]
452
 			if len(ext) > conf['max_ext_length']:
493
 			if len(ext) > conf['max_ext_length']:
453
 				ext = ''
494
 				ext = ''
454
-
455
 			if ext not in self.extensions:
495
 			if ext not in self.extensions:
456
 				self.extensions[ext] = {'files': 0, 'lines': 0}
496
 				self.extensions[ext] = {'files': 0, 'lines': 0}
457
-
458
 			self.extensions[ext]['files'] += 1
497
 			self.extensions[ext]['files'] += 1
459
-			try:
460
-				self.extensions[ext]['lines'] += self.getLinesInBlob(sha1)
461
-			except:
462
-				print 'Warning: Could not count lines for file "%s"' % line
498
+			#if cache is empty then add ext and blob id to the list of new blobs
499
+			#otherwise try to read needed info from cache
500
+			if 'lines_in_blob' not in self.cache.keys():
501
+				blobs_to_read.append((ext,blob_id))
502
+				continue
503
+			if blob_id in self.cache['lines_in_blob'].keys():
504
+				self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
505
+			else:
506
+				blobs_to_read.append((ext,blob_id))
507
+
508
+		#Get line counts for new blobs that weren't found in the cache
509
+		ext_blob_linecount = Pool(processes=24).map(getnumoflinesinblob, blobs_to_read)
510
+
511
+		#Update cache and add the line counts to the per-extension totals
512
+		for (ext, blob_id, linecount) in ext_blob_linecount:
513
+			if 'lines_in_blob' not in self.cache:
514
+				self.cache['lines_in_blob'] = {}
515
+			self.cache['lines_in_blob'][blob_id] = linecount
516
+			self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
463
 
517
 
464
 		# line statistics
518
 		# line statistics
465
 		# outputs:
519
 		# outputs:
619
 	def getDomains(self):
673
 	def getDomains(self):
620
 		return self.domains.keys()
674
 		return self.domains.keys()
621
 	
675
 	
622
-	def getFilesInCommit(self, rev):
623
-		try:
624
-			res = self.cache['files_in_tree'][rev]
625
-		except:
626
-			res = int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0])
627
-			if 'files_in_tree' not in self.cache:
628
-				self.cache['files_in_tree'] = {}
629
-			self.cache['files_in_tree'][rev] = res
630
-
631
-		return res
632
-
633
 	def getFirstCommitDate(self):
676
 	def getFirstCommitDate(self):
634
 		return datetime.datetime.fromtimestamp(self.first_commit_stamp)
677
 		return datetime.datetime.fromtimestamp(self.first_commit_stamp)
635
 	
678
 	
636
 	def getLastCommitDate(self):
679
 	def getLastCommitDate(self):
637
 		return datetime.datetime.fromtimestamp(self.last_commit_stamp)
680
 		return datetime.datetime.fromtimestamp(self.last_commit_stamp)
638
 	
681
 	
639
-	def getLinesInBlob(self, sha1):
640
-		try:
641
-			res = self.cache['lines_in_blob'][sha1]
642
-		except:
643
-			res = int(getpipeoutput(['git cat-file blob %s' % sha1, 'wc -l']).split()[0])
644
-			if 'lines_in_blob' not in self.cache:
645
-				self.cache['lines_in_blob'] = {}
646
-			self.cache['lines_in_blob'][sha1] = res
647
-		return res
648
-
649
 	def getTags(self):
682
 	def getTags(self):
650
 		lines = getpipeoutput(['git show-ref --tags', 'cut -d/ -f3'])
683
 		lines = getpipeoutput(['git show-ref --tags', 'cut -d/ -f3'])
651
 		return lines.split('\n')
684
 		return lines.split('\n')