gen_author_data.py 3.4KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. import csv
  2. import logging
  3. import os
  4. import re
  5. from gitstats import cli, cd
  6. from gitstats.miscfuncs import getlogrange, getpipeoutput, getstatsummarycounts
  7. from gitstats.data import AuthorRow
  8. def gen_author_data(conf, row_processor):
  9. '''
  10. Given a configuration, pull authorship information. For
  11. each author, callback to the row_processor passing an AuthorRow
  12. :param conf: configuration (mostly used for date limits)
  13. :param row_processor: function to receive the callback
  14. :return: None
  15. '''
  16. # DBG: git log --shortstat --date-order --pretty=format:"%H %at %aN" --since="2017-10-01" "HEAD"
  17. # Results are in the form of
  18. #
  19. # 3c16756701d264619db0b309f42ebdc713b29827 1522513256 Dan Rapp
  20. # 524ee0d32ffbbb8bb82966b769bbf7dbc1d87a68 1522480979 Michael Wright
  21. # 1 file changed, 6 insertions(+)
  22. #
  23. # If there are two (or more) lines,
  24. # The first line(s) is the merge to master or other branch
  25. # The last line is the commit on the branch
  26. lines = getpipeoutput(
  27. ['git log --shortstat --date-order --pretty=format:"%%H %%at %%aN" %s' % (
  28. getlogrange(conf, 'HEAD'))]).split('\n')
  29. lines.reverse()
  30. files = 0
  31. inserted = 0
  32. deleted = 0
  33. stamp = 0
  34. for line in lines:
  35. if len(line) == 0:
  36. continue
  37. # <stamp> <author>
  38. if re.search('files? changed', line) is None:
  39. if files + inserted + deleted > 0: # this case indicates we've already processed the line
  40. pos = line.find(' ')
  41. if pos != -1:
  42. try:
  43. oldstamp = stamp
  44. tokens = line.split()
  45. sha = tokens[0]
  46. stamp = int(tokens[1])
  47. author = ' '.join(tokens[2:])
  48. if oldstamp > stamp:
  49. # clock skew, keep old timestamp to avoid having ugly graph
  50. stamp = oldstamp
  51. row_processor(AuthorRow(sha, stamp, author, files, inserted, deleted))
  52. # Since subsequent lines are (generally) reflections of merging into a branch
  53. # don't provide "credit" to the author did the merge
  54. (files, inserted, deleted) = 0, 0, 0
  55. except ValueError:
  56. logging.warning(f'unexpected line "{line}')
  57. else:
  58. logging.warning(f'unexpected line "{line}')
  59. else:
  60. numbers = getstatsummarycounts(line)
  61. if len(numbers) == 3:
  62. (files, inserted, deleted) = map(lambda el: int(el), numbers)
  63. else:
  64. logging.warning(f'Failed to handle line "{line}"')
  65. (files, inserted, deleted) = (0, 0, 0)
  66. if __name__ == "__main__":
  67. conf, paths, outputpath = cli.get_cli()
  68. with open(outputpath, 'w', encoding='utf8') as f:
  69. writer = csv.writer(f)
  70. writer.writerow(['repo', 'sha', 'stamp', 'author', 'files changed', 'lines inserted', 'lines deleted'])
  71. for path in paths:
  72. repo_name = os.path.split(path)[1]
  73. with (cd.cd(path)):
  74. gen_author_data(
  75. conf,
  76. lambda row: writer.writerow([repo_name, row.sha, row.stamp, row.author, row.files_modified,
  77. row.lines_inserted, row.lines_deleted]))