datacollector.py 4.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. import datetime
  2. import logging
  3. import os
  4. import pickle
  5. import time
  6. import zlib
  7. from collections import defaultdict
  8. from typing import Dict
  9. from gitstats.data.author import Author
  10. class DataCollector:
  11. """Manages data collection from a revision control repository."""
  12. def __init__(self, conf):
  13. self.conf = conf
  14. self.stamp_created = time.time()
  15. self.cache = {}
  16. self.total_authors = 0
  17. self.activity_by_hour_of_day = {} # hour -> commits
  18. self.activity_by_day_of_week = {} # day -> commits
  19. self.activity_by_month_of_year = {} # month [1-12] -> commits
  20. self.activity_by_hour_of_week = {} # weekday -> hour -> commits
  21. self.activity_by_hour_of_day_busiest = 0
  22. self.activity_by_hour_of_week_busiest = 0
  23. self.activity_by_year_week = {} # yy_wNN -> commits
  24. self.activity_by_year_week_peak = 0
  25. self.authors: Dict[Author] = defaultdict(lambda: Author())
  26. self.total_commits = 0
  27. self.total_files = 0
  28. self.authors_by_commits = 0
  29. # domains
  30. self.domains = {} # domain -> commits
  31. # author of the month
  32. self.author_of_month = {} # month -> author -> commits
  33. self.author_of_year = {} # year -> author -> commits
  34. self.commits_by_month = {} # month -> commits
  35. self.commits_by_year = {} # year -> commits
  36. self.lines_added_by_month = {} # month -> lines added
  37. self.lines_added_by_year = {} # year -> lines added
  38. self.lines_removed_by_month = {} # month -> lines removed
  39. self.lines_removed_by_year = {} # year -> lines removed
  40. self.first_commit_stamp = 0
  41. self.last_commit_stamp = 0
  42. self.last_active_day = None
  43. self.active_days = set()
  44. # lines
  45. self.total_lines = 0
  46. self.total_lines_added = 0
  47. self.total_lines_removed = 0
  48. # size
  49. self.total_size = 0
  50. # timezone
  51. self.commits_by_timezone = {} # timezone -> commits
  52. # tags
  53. self.tags = {}
  54. self.files_by_stamp = {} # stamp -> files
  55. # extensions
  56. self.extensions = {} # extension -> files, lines
  57. # line statistics
  58. self.changes_by_date = {} # stamp -> { files, ins, del }
  59. ##
  60. # This should be the main function to extract data from the repository.
  61. def collect(self, dir):
  62. self.dir = dir
  63. if len(self.conf['project_name']) == 0:
  64. self.projectname = os.path.basename(os.path.abspath(dir))
  65. else:
  66. self.projectname = self.conf['project_name']
  67. ##
  68. # Load cacheable data
  69. def loadCache(self, cachefile):
  70. if not os.path.exists(cachefile):
  71. return
  72. logging.info('Loading cache...')
  73. f = open(cachefile, 'rb')
  74. try:
  75. self.cache = pickle.loads(zlib.decompress(f.read()))
  76. except:
  77. # temporary hack to upgrade non-compressed caches
  78. f.seek(0)
  79. self.cache = pickle.load(f)
  80. f.close()
  81. ##
  82. # Produce any additional statistics from the extracted data.
  83. def refine(self):
  84. pass
  85. ##
  86. # : get a dictionary of author
  87. def getAuthorInfo(self, author):
  88. return None
  89. def getActivityByDayOfWeek(self):
  90. return {}
  91. def getActivityByHourOfDay(self):
  92. return {}
  93. # : get a dictionary of domains
  94. def getDomainInfo(self, domain):
  95. return None
  96. ##
  97. # Get a list of authors
  98. def getAuthors(self):
  99. return []
  100. def getFirstCommitDate(self):
  101. return datetime.datetime.now()
  102. def getLastCommitDate(self):
  103. return datetime.datetime.now()
  104. def getStampCreated(self):
  105. return self.stamp_created
  106. def getTags(self):
  107. return []
  108. def getTotalAuthors(self):
  109. return -1
  110. def getTotalCommits(self):
  111. return -1
  112. def getTotalFiles(self):
  113. return -1
  114. def getTotalLOC(self):
  115. return -1
  116. ##
  117. # Save cacheable data
  118. def saveCache(self, cachefile):
  119. logging.info('Saving cache...')
  120. tempfile = cachefile + '.tmp'
  121. f = open(tempfile, 'wb')
  122. # pickle.dump(self.cache, f)
  123. data = zlib.compress(pickle.dumps(self.cache))
  124. f.write(data)
  125. f.close()
  126. try:
  127. os.remove(cachefile)
  128. except OSError:
  129. pass
  130. os.rename(tempfile, cachefile)