gitdatacollector.py 19KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
  1. import datetime
  2. import logging
  3. import re
  4. import os
  5. from multiprocessing import Pool
  6. from .datacollector import DataCollector
  7. from .miscfuncs import getcommitrange, getkeyssortedbyvaluekey, getlogrange, getnumoffilesfromrev, getnumoflinesinblob, \
  8. getpipeoutput, getstatsummarycounts
  9. class GitDataCollector(DataCollector):
  10. def __init__(self, conf):
  11. super(GitDataCollector, self).__init__(conf)
  12. def collect(self, directory):
  13. super(GitDataCollector, self).collect(directory)
  14. self.total_authors += int(getpipeoutput(['git shortlog -s %s' % getlogrange(self.conf), 'wc -l']))
  15. # self.total_lines = int(getoutput('git-ls-files -z |xargs -0 cat |wc -l'))
  16. # tags
  17. lines = getpipeoutput(['git show-ref --tags']).split('\n')
  18. for line in lines:
  19. if len(line) == 0:
  20. continue
  21. (line_hash, tag) = line.split(' ')
  22. tag = tag.replace('refs/tags/', '')
  23. output = getpipeoutput(['git log "%s" --pretty=format:"%%at %%aN" -n 1' % line_hash])
  24. if len(output) > 0:
  25. parts = output.split(' ')
  26. try:
  27. stamp = int(parts[0])
  28. except ValueError:
  29. stamp = 0
  30. self.tags[tag] = {'stamp': stamp, 'hash': line_hash,
  31. 'date': datetime.datetime.fromtimestamp(stamp).strftime('%Y-%m-%d'), 'commits': 0,
  32. 'authors': {}}
  33. # collect info on tags, starting from latest
  34. tags_sorted_by_date_asc = [tup[1] for tup in sorted([(el[1]['date'], el[0]) for el in self.tags.items()])]
  35. # tags_sorted_by_date_desc = map(lambda el: el[1],
  36. # reversed(sorted(map(lambda el: (el[1]['date'], el[0]), self.tags.items()))))
  37. prev = None
  38. # for tag in reversed(tags_sorted_by_date_desc):
  39. for tag in tags_sorted_by_date_asc:
  40. cmd = 'git shortlog -s "%s"' % tag
  41. if prev is not None:
  42. cmd += ' "^%s"' % prev
  43. output = getpipeoutput([cmd])
  44. if len(output) == 0:
  45. continue
  46. prev = tag
  47. for line in output.split('\n'):
  48. parts = re.split('\s+', line, 2)
  49. commits = int(parts[1])
  50. author = parts[2]
  51. self.tags[tag]['commits'] += commits
  52. self.tags[tag]['authors'][author] = commits
  53. # Collect revision statistics
  54. # Outputs "<stamp> <date> <time> <timezone> <author> '<' <mail> '>'"
  55. lines = getpipeoutput(
  56. ['git rev-list --pretty=format:"%%at %%ai %%aN <%%aE>" %s' % getlogrange(self.conf, 'HEAD'), 'grep -v ^commit']).split(
  57. '\n')
  58. for line in lines:
  59. parts = line.split(' ', 4)
  60. try:
  61. stamp = int(parts[0])
  62. except ValueError:
  63. stamp = 0
  64. timezone = parts[3]
  65. author, mail = parts[4].split('<', 1)
  66. author = author.rstrip()
  67. mail = mail.rstrip('>')
  68. domain = '?'
  69. if mail.find('@') != -1:
  70. domain = mail.rsplit('@', 1)[1]
  71. date = datetime.datetime.fromtimestamp(float(stamp))
  72. # First and last commit stamp (may be in any order because of cherry-picking and patches)
  73. if stamp > self.last_commit_stamp:
  74. self.last_commit_stamp = stamp
  75. if self.first_commit_stamp == 0 or stamp < self.first_commit_stamp:
  76. self.first_commit_stamp = stamp
  77. # activity
  78. # hour
  79. hour = date.hour
  80. self.activity_by_hour_of_day[hour] = self.activity_by_hour_of_day.get(hour, 0) + 1
  81. # most active hour?
  82. if self.activity_by_hour_of_day[hour] > self.activity_by_hour_of_day_busiest:
  83. self.activity_by_hour_of_day_busiest = self.activity_by_hour_of_day[hour]
  84. # day of week
  85. day = date.weekday()
  86. self.activity_by_day_of_week[day] = self.activity_by_day_of_week.get(day, 0) + 1
  87. # domain stats
  88. if domain not in self.domains:
  89. self.domains[domain] = {}
  90. # commits
  91. self.domains[domain]['commits'] = self.domains[domain].get('commits', 0) + 1
  92. # hour of week
  93. if day not in self.activity_by_hour_of_week:
  94. self.activity_by_hour_of_week[day] = {}
  95. self.activity_by_hour_of_week[day][hour] = self.activity_by_hour_of_week[day].get(hour, 0) + 1
  96. # most active hour?
  97. if self.activity_by_hour_of_week[day][hour] > self.activity_by_hour_of_week_busiest:
  98. self.activity_by_hour_of_week_busiest = self.activity_by_hour_of_week[day][hour]
  99. # month of year
  100. month = date.month
  101. self.activity_by_month_of_year[month] = self.activity_by_month_of_year.get(month, 0) + 1
  102. # yearly/weekly activity
  103. yyw = date.strftime('%Y-%W')
  104. self.activity_by_year_week[yyw] = self.activity_by_year_week.get(yyw, 0) + 1
  105. if self.activity_by_year_week_peak < self.activity_by_year_week[yyw]:
  106. self.activity_by_year_week_peak = self.activity_by_year_week[yyw]
  107. # author stats
  108. if author not in self.authors:
  109. self.authors[author] = {}
  110. # commits, note again that commits may be in any date order because of cherry-picking and patches
  111. if 'last_commit_stamp' not in self.authors[author]:
  112. self.authors[author]['last_commit_stamp'] = stamp
  113. if stamp > self.authors[author]['last_commit_stamp']:
  114. self.authors[author]['last_commit_stamp'] = stamp
  115. if 'first_commit_stamp' not in self.authors[author]:
  116. self.authors[author]['first_commit_stamp'] = stamp
  117. if stamp < self.authors[author]['first_commit_stamp']:
  118. self.authors[author]['first_commit_stamp'] = stamp
  119. # author of the month/year
  120. yymm = date.strftime('%Y-%m')
  121. if yymm in self.author_of_month:
  122. self.author_of_month[yymm][author] = self.author_of_month[yymm].get(author, 0) + 1
  123. else:
  124. self.author_of_month[yymm] = {}
  125. self.author_of_month[yymm][author] = 1
  126. self.commits_by_month[yymm] = self.commits_by_month.get(yymm, 0) + 1
  127. yy = date.year
  128. if yy in self.author_of_year:
  129. self.author_of_year[yy][author] = self.author_of_year[yy].get(author, 0) + 1
  130. else:
  131. self.author_of_year[yy] = {}
  132. self.author_of_year[yy][author] = 1
  133. self.commits_by_year[yy] = self.commits_by_year.get(yy, 0) + 1
  134. # authors: active days
  135. yymmdd = date.strftime('%Y-%m-%d')
  136. if 'last_active_day' not in self.authors[author]:
  137. self.authors[author]['last_active_day'] = yymmdd
  138. self.authors[author]['active_days'] = {yymmdd}
  139. elif yymmdd != self.authors[author]['last_active_day']:
  140. self.authors[author]['last_active_day'] = yymmdd
  141. self.authors[author]['active_days'].add(yymmdd)
  142. # project: active days
  143. if yymmdd != self.last_active_day:
  144. self.last_active_day = yymmdd
  145. self.active_days.add(yymmdd)
  146. # timezone
  147. self.commits_by_timezone[timezone] = self.commits_by_timezone.get(timezone, 0) + 1
  148. # outputs "<stamp> <files>" for each revision
  149. revlines = getpipeoutput(
  150. ['git rev-list --pretty=format:"%%at %%T" %s' % getlogrange(self.conf, 'HEAD'), 'grep -v ^commit']).strip().split('\n')
  151. lines = []
  152. revs_to_read = []
  153. # Look up rev in cache and take info from cache if found
  154. # If not append rev to list of rev to read from repo
  155. for revline in revlines:
  156. time, rev = revline.split(' ')
  157. # if cache empty then add time and rev to list of new rev's
  158. # otherwise try to read needed info from cache
  159. if 'files_in_tree' not in self.cache.keys():
  160. revs_to_read.append((time, rev))
  161. continue
  162. if rev in self.cache['files_in_tree'].keys():
  163. lines.append('%d %d' % (int(time), self.cache['files_in_tree'][rev]))
  164. else:
  165. revs_to_read.append((time, rev))
  166. # Read revisions from repo
  167. pool = Pool(processes=self.conf['processes'])
  168. time_rev_count = pool.map(getnumoffilesfromrev, revs_to_read)
  169. pool.terminate()
  170. pool.join()
  171. # Update cache with new revisions and append then to general list
  172. for (time, rev, count) in time_rev_count:
  173. if 'files_in_tree' not in self.cache:
  174. self.cache['files_in_tree'] = {}
  175. self.cache['files_in_tree'][rev] = count
  176. lines.append('%d %d' % (int(time), count))
  177. self.total_commits += len(lines)
  178. for line in lines:
  179. parts = line.split(' ')
  180. if len(parts) != 2:
  181. continue
  182. (stamp, files) = parts[0:2]
  183. try:
  184. self.files_by_stamp[int(stamp)] = int(files)
  185. except ValueError:
  186. logging.warning(f'Failed to parse line "{line}"')
  187. # extensions and size of files
  188. lines = getpipeoutput(['git ls-tree -r -l -z %s' % getcommitrange(self.conf, 'HEAD', end_only=True)]).split('\000')
  189. blobs_to_read = []
  190. for line in lines:
  191. if len(line) == 0:
  192. continue
  193. parts = re.split('\s+', line, 4)
  194. if parts[0] == '160000' and parts[3] == '-':
  195. # skip submodules
  196. continue
  197. blob_id = parts[2]
  198. size = int(parts[3])
  199. fullpath = parts[4]
  200. self.total_size += size
  201. self.total_files += 1
  202. _, ext = os.path.splitext(fullpath)
  203. if len(ext) > self.conf['max_ext_length']:
  204. ext = ''
  205. if ext not in self.extensions:
  206. self.extensions[ext] = {'files': 0, 'lines': 0}
  207. self.extensions[ext]['files'] += 1
  208. # if cache empty then add ext and blob id to list of new blob's
  209. # otherwise try to read needed info from cache
  210. if 'lines_in_blob' not in self.cache.keys():
  211. blobs_to_read.append((ext, blob_id))
  212. continue
  213. if blob_id in self.cache['lines_in_blob'].keys():
  214. self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
  215. else:
  216. blobs_to_read.append((ext, blob_id))
  217. # Get info abount line count for new blob's that wasn't found in cache
  218. pool = Pool(processes=self.conf['processes'])
  219. ext_blob_linecount = pool.map(getnumoflinesinblob, blobs_to_read)
  220. pool.terminate()
  221. pool.join()
  222. # Update cache and write down info about number of number of lines
  223. for (ext, blob_id, linecount) in ext_blob_linecount:
  224. if 'lines_in_blob' not in self.cache:
  225. self.cache['lines_in_blob'] = {}
  226. self.cache['lines_in_blob'][blob_id] = linecount
  227. self.extensions[ext]['lines'] += self.cache['lines_in_blob'][blob_id]
  228. # line statistics
  229. # outputs:
  230. # N files changed, N insertions (+), N deletions(-)
  231. # <stamp> <author>
  232. self.changes_by_date = {} # stamp -> { files, ins, del }
  233. # computation of lines of code by date is better done
  234. # on a linear history.
  235. extra = ''
  236. if self.conf['linear_linestats']:
  237. extra = '--first-parent -m'
  238. lines = getpipeoutput(
  239. ['git log --shortstat %s --pretty=format:"%%at %%aN" %s' % (extra, getlogrange(self.conf, 'HEAD'))]).split('\n')
  240. lines.reverse()
  241. files = 0
  242. inserted = 0
  243. deleted = 0
  244. total_lines = 0
  245. for line in lines:
  246. if len(line) == 0:
  247. continue
  248. # <stamp> <author>
  249. if re.search('files? changed', line) is None:
  250. pos = line.find(' ')
  251. if pos != -1:
  252. try:
  253. (stamp, author) = (int(line[:pos]), line[pos + 1:])
  254. self.changes_by_date[stamp] = {'files': files, 'ins': inserted, 'del': deleted,
  255. 'lines': total_lines}
  256. date = datetime.datetime.fromtimestamp(stamp)
  257. yymm = date.strftime('%Y-%m')
  258. self.lines_added_by_month[yymm] = self.lines_added_by_month.get(yymm, 0) + inserted
  259. self.lines_removed_by_month[yymm] = self.lines_removed_by_month.get(yymm, 0) + deleted
  260. yy = date.year
  261. self.lines_added_by_year[yy] = self.lines_added_by_year.get(yy, 0) + inserted
  262. self.lines_removed_by_year[yy] = self.lines_removed_by_year.get(yy, 0) + deleted
  263. files, inserted, deleted = 0, 0, 0
  264. except ValueError:
  265. logging.warning(f'unexpected line "{line}')
  266. else:
  267. logging.warning(f'unexpected line "{line}')
  268. else:
  269. numbers = getstatsummarycounts(line)
  270. if len(numbers) == 3:
  271. (files, inserted, deleted) = map(lambda el: int(el), numbers)
  272. total_lines += inserted
  273. total_lines -= deleted
  274. self.total_lines_added += inserted
  275. self.total_lines_removed += deleted
  276. else:
  277. logging.warning(f'Failed to handle line "{line}"')
  278. (files, inserted, deleted) = (0, 0, 0)
  279. # self.changes_by_date[stamp] = { 'files': files, 'ins': inserted, 'del': deleted }
  280. self.total_lines += total_lines
  281. # Per-author statistics
  282. # defined for stamp, author only if author commited at this timestamp.
  283. self.changes_by_date_by_author = {} # stamp -> author -> lines_added
  284. # Similar to the above, but never use --first-parent
  285. # (we need to walk through every commit to know who
  286. # committed what, not just through mainline)
  287. lines = getpipeoutput(
  288. ['git log --shortstat --date-order --pretty=format:"%%at %%aN" %s' % (getlogrange(self.conf, 'HEAD'))]).split('\n')
  289. lines.reverse()
  290. inserted = 0
  291. deleted = 0
  292. stamp = 0
  293. for line in lines:
  294. if len(line) == 0:
  295. continue
  296. # <stamp> <author>
  297. if re.search('files? changed', line) is None:
  298. pos = line.find(' ')
  299. if pos != -1:
  300. try:
  301. oldstamp = stamp
  302. (stamp, author) = (int(line[:pos]), line[pos + 1:])
  303. if oldstamp > stamp:
  304. # clock skew, keep old timestamp to avoid having ugly graph
  305. stamp = oldstamp
  306. if author not in self.authors:
  307. self.authors[author] = {'lines_added': 0, 'lines_removed': 0, 'commits': 0}
  308. self.authors[author]['commits'] = self.authors[author].get('commits', 0) + 1
  309. self.authors[author]['lines_added'] = self.authors[author].get('lines_added', 0) + inserted
  310. self.authors[author]['lines_removed'] = self.authors[author].get('lines_removed', 0) + deleted
  311. if stamp not in self.changes_by_date_by_author:
  312. self.changes_by_date_by_author[stamp] = {}
  313. if author not in self.changes_by_date_by_author[stamp]:
  314. self.changes_by_date_by_author[stamp][author] = {}
  315. self.changes_by_date_by_author[stamp][author]['lines_added'] = self.authors[author][
  316. 'lines_added']
  317. self.changes_by_date_by_author[stamp][author]['commits'] = self.authors[author]['commits']
  318. files, inserted, deleted = 0, 0, 0
  319. except ValueError:
  320. logging.warning(f'unexpected line "{line}')
  321. else:
  322. logging.warning(f'unexpected line "{line}')
  323. else:
  324. numbers = getstatsummarycounts(line)
  325. if len(numbers) == 3:
  326. (files, inserted, deleted) = map(lambda el: int(el), numbers)
  327. else:
  328. logging.warning(f'Failed to handle line "{line}"')
  329. (files, inserted, deleted) = (0, 0, 0)
  330. def refine(self):
  331. # authors
  332. # name -> {place_by_commits, commits_frac, date_first, date_last, timedelta}
  333. self.authors_by_commits = getkeyssortedbyvaluekey(self.authors, 'commits')
  334. self.authors_by_commits.reverse() # most first
  335. for i, name in enumerate(self.authors_by_commits):
  336. self.authors[name]['place_by_commits'] = i + 1
  337. for name in self.authors.keys():
  338. a = self.authors[name]
  339. a['commits_frac'] = (100 * float(a['commits'])) / self.getTotalCommits()
  340. date_first = datetime.datetime.fromtimestamp(a['first_commit_stamp'])
  341. date_last = datetime.datetime.fromtimestamp(a['last_commit_stamp'])
  342. delta = date_last - date_first
  343. a['date_first'] = date_first.strftime('%Y-%m-%d')
  344. a['date_last'] = date_last.strftime('%Y-%m-%d')
  345. a['timedelta'] = delta
  346. if 'lines_added' not in a:
  347. a['lines_added'] = 0
  348. if 'lines_removed' not in a:
  349. a['lines_removed'] = 0
  350. def getActiveDays(self):
  351. return self.active_days
  352. def getActivityByDayOfWeek(self):
  353. return self.activity_by_day_of_week
  354. def getActivityByHourOfDay(self):
  355. return self.activity_by_hour_of_day
  356. def getAuthorInfo(self, author):
  357. return self.authors[author]
  358. def getAuthors(self, limit=None):
  359. res = getkeyssortedbyvaluekey(self.authors, 'commits')
  360. res.reverse()
  361. return res[:limit]
  362. def getCommitDeltaDays(self):
  363. return (self.last_commit_stamp / 86400 - self.first_commit_stamp / 86400) + 1
  364. def getDomainInfo(self, domain):
  365. return self.domains[domain]
  366. def getDomains(self):
  367. return self.domains.keys()
  368. def getFirstCommitDate(self):
  369. return datetime.datetime.fromtimestamp(self.first_commit_stamp)
  370. def getLastCommitDate(self):
  371. return datetime.datetime.fromtimestamp(self.last_commit_stamp)
  372. def getTags(self):
  373. lines = getpipeoutput(['git show-ref --tags', 'cut -d/ -f3'])
  374. return lines.split('\n')
  375. def getTagDate(self, tag):
  376. return self.revToDate('tags/' + tag)
  377. def getTotalAuthors(self):
  378. return self.total_authors
  379. def getTotalCommits(self):
  380. return self.total_commits
  381. def getTotalFiles(self):
  382. return self.total_files
  383. def getTotalLOC(self):
  384. return self.total_lines
  385. def getTotalSize(self):
  386. return self.total_size
  387. def revToDate(self, rev):
  388. stamp = int(getpipeoutput(['git log --pretty=format:%%at "%s" -n 1' % rev]))
  389. return datetime.datetime.fromtimestamp(stamp).strftime('%Y-%m-%d')