
Fix performance issue for huge repositories

Problem: gitstats reads every commit and every file in the repository in a
single thread during the initial statistics generation (i.e. when no cache is
available). This can take a long time for huge repositories (100 000+ files).
Solution: execute all read commands in a pool of 24 worker processes instead of
a single thread.
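
For illustration only, a minimal sketch of the pattern this change applies: answer lookups from the cache where possible and batch the misses through a multiprocessing.Pool so the expensive git reads overlap. The helper names (count_lines_in_blob, cached_line_counts) are hypothetical and not part of gitstats; the 24-worker pool size mirrors the value hard-coded in the commit. The worker has to be a module-level function so the pool can pickle it.

# Hypothetical sketch of the cache-or-parallel-read pattern (not gitstats code).
import subprocess
from multiprocessing import Pool

def count_lines_in_blob(blob_id):
	# Module-level so multiprocessing can pickle it for the worker processes.
	blob = subprocess.check_output(['git', 'cat-file', 'blob', blob_id])
	return (blob_id, blob.count(b'\n'))

def cached_line_counts(blob_ids, cache):
	# Serve hits from the cache; read the misses in parallel, then cache them.
	misses = [b for b in blob_ids if b not in cache]
	if misses:
		pool = Pool(processes=24)  # same worker count the commit hard-codes
		try:
			for blob_id, count in pool.map(count_lines_in_blob, misses):
				cache[blob_id] = count
		finally:
			pool.close()
			pool.join()
	return dict((b, cache[b]) for b in blob_ids)

Note that Pool(processes=24) spawns separate processes rather than threads, which is what lets the per-revision git subprocess calls run concurrently; when the processes argument is omitted, multiprocessing defaults to the machine's CPU count.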

Signed-off-by: Heikki Hokkanen <hoxu@users.sf.net>
Andrey Devyatkin, 13 years ago
parent commit 8647c75d48
1 changed file with 64 additions and 31 deletions

gitstats  +64 -31

@@ -14,6 +14,12 @@ import sys
 import time
 import zlib
 
+if sys.version_info < (2, 6):
+	print >> sys.stderr, "Python 2.6 or higher is required for gitstats"
+	sys.exit(1)
+
+from multiprocessing import Pool
+
 os.environ['LC_ALL'] = 'C'
 
 GNUPLOT_COMMON = 'set terminal png transparent size 640,240\nset size 1.0,1.0\n'
@@ -104,6 +110,20 @@ def getgitversion():
 def getgnuplotversion():
 	return getpipeoutput(['%s --version' % gnuplot_cmd]).split('\n')[0]
 
+def getnumoffilesfromrev(time_rev):
+	"""
+	Get the number of files in the tree of the given revision
+	"""
+	time, rev = time_rev
+	return (int(time), rev, int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0]))
+
+def getnumoflinesinblob(ext_blob):
+	"""
+	Get the number of lines in a blob
+	"""
+	ext, blob_id = ext_blob
+	return (ext, blob_id, int(getpipeoutput(['git cat-file blob %s' % blob_id, 'wc -l']).split()[0]))
+
 class DataCollector:
 	"""Manages data collection from a revision control repository."""
 	def __init__(self):
@@ -408,14 +428,34 @@ class GitDataCollector(DataCollector):
 			# timezone
 			self.commits_by_timezone[timezone] = self.commits_by_timezone.get(timezone, 0) + 1
 
-		# TODO Optimize this, it's the worst bottleneck
 		# outputs "<stamp> <files>" for each revision
 		revlines = getpipeoutput(['git rev-list --pretty=format:"%%at %%T" %s' % getcommitrange('HEAD'), 'grep -v ^commit']).strip().split('\n')
 		lines = []
+		revs_to_read = []
+		time_rev_count = []
+		# Look up each rev in the cache and take the info from there if found;
+		# if not, append the rev to the list of revs to read from the repo
 		for revline in revlines:
 			time, rev = revline.split(' ')
-			linecount = self.getFilesInCommit(rev)
-			lines.append('%d %d' % (int(time), linecount))
+			# if the cache is empty, add time and rev to the list of new revs;
+			# otherwise try to read the needed info from the cache
+			if 'files_in_tree' not in self.cache.keys():
+				revs_to_read.append((time,rev))
+				continue
+			if rev in self.cache['files_in_tree'].keys():
+				lines.append('%d %d' % (int(time), self.cache['files_in_tree'][rev]))
+			else:
+				revs_to_read.append((time,rev))
+
+		# Read the uncached revisions from the repo in parallel
+		time_rev_count = Pool(processes=24).map(getnumoffilesfromrev, revs_to_read)
+
+		# Update the cache with the new revisions and append them to the general list
+		for (time, rev, count) in time_rev_count:
+			if 'files_in_tree' not in self.cache:
+				self.cache['files_in_tree'] = {}
+			self.cache['files_in_tree'][rev] = count
+			lines.append('%d %d' % (int(time), count))
 
 		self.total_commits += len(lines)
 		for line in lines:
@@ -430,6 +470,7 @@ class GitDataCollector(DataCollector):
 
 		# extensions and size of files
 		lines = getpipeoutput(['git ls-tree -r -l -z %s' % getcommitrange('HEAD', end_only = True)]).split('\000')
+		blobs_to_read = []
 		for line in lines:
 			if len(line) == 0:
 				continue
@@ -437,7 +478,7 @@ class GitDataCollector(DataCollector):
 			if parts[0] == '160000' and parts[3] == '-':
 				# skip submodules
 				continue
-			sha1 = parts[2]
+			blob_id = parts[2]
 			size = int(parts[3])
 			fullpath = parts[4]
 
@@ -451,15 +492,28 @@ class GitDataCollector(DataCollector):
 				ext = filename[(filename.rfind('.') + 1):]
 			if len(ext) > conf['max_ext_length']:
 				ext = ''
-
 			if ext not in self.extensions:
 				self.extensions[ext] = {'files': 0, 'lines': 0}
-
 			self.extensions[ext]['files'] += 1
-			try:
-				self.extensions[ext]['lines'] += self.getLinesInBlob(sha1)
-			except:
-				print 'Warning: Could not count lines for file "%s"' % line
+			# if the cache is empty, add the ext and blob id to the list of new blobs;
+			# otherwise try to read the needed info from the cache
+			if 'lines_in_blob' not in self.cache.keys():
+				blobs_to_read.append((ext,blob_id))
+				continue
+			if blob_id in self.cache['lines_in_blob'].keys():
+				self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
+			else:
+				blobs_to_read.append((ext,blob_id))
+
+		# Get line counts for the new blobs that were not found in the cache
+		ext_blob_linecount = Pool(processes=24).map(getnumoflinesinblob, blobs_to_read)
+
+		# Update the cache and add the line counts to the per-extension totals
+		for (ext, blob_id, linecount) in ext_blob_linecount:
+			if 'lines_in_blob' not in self.cache:
+				self.cache['lines_in_blob'] = {}
+			self.cache['lines_in_blob'][blob_id] = linecount
+			self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
 
 		# line statistics
 		# outputs:
@@ -619,33 +673,12 @@ class GitDataCollector(DataCollector):
 	def getDomains(self):
 		return self.domains.keys()
 	
-	def getFilesInCommit(self, rev):
-		try:
-			res = self.cache['files_in_tree'][rev]
-		except:
-			res = int(getpipeoutput(['git ls-tree -r --name-only "%s"' % rev, 'wc -l']).split('\n')[0])
-			if 'files_in_tree' not in self.cache:
-				self.cache['files_in_tree'] = {}
-			self.cache['files_in_tree'][rev] = res
-
-		return res
-
 	def getFirstCommitDate(self):
 		return datetime.datetime.fromtimestamp(self.first_commit_stamp)
 	
 	def getLastCommitDate(self):
 		return datetime.datetime.fromtimestamp(self.last_commit_stamp)
 	
-	def getLinesInBlob(self, sha1):
-		try:
-			res = self.cache['lines_in_blob'][sha1]
-		except:
-			res = int(getpipeoutput(['git cat-file blob %s' % sha1, 'wc -l']).split()[0])
-			if 'lines_in_blob' not in self.cache:
-				self.cache['lines_in_blob'] = {}
-			self.cache['lines_in_blob'][sha1] = res
-		return res
-
 	def getTags(self):
 		lines = getpipeoutput(['git show-ref --tags', 'cut -d/ -f3'])
 		return lines.split('\n')