Source code for gitlab2pandas.extractions

from typing import Union
import json
import sys
import threading
import queue
import pandas as pd
from gitlab2pandas.core import Core
from gitlab.exceptions import GitlabAuthenticationError

class Extractions(Core):
    """
    Initializes extractions object with general information.

    Decide whether to initialize with a project object or with the project
    namespace and name. Extractions can only be done with a project object or
    after connecting to a server with the project namespace and name.

    Parameters
    ----------
    data_root_dir : str
        A existing top level directory for data extraction.
    project : Project, default=None
        Project object from gitlab.
    project_namespace : str, default=None
        Namespace of the project.
    project_name : str, default=None
        Name of the project.
    extract_parallel: bool, default=False
        Parallel extraction might fail for some GitLab Server because of
        server settings.
    """

    # NOTE(review): not referenced inside this class; presumably consumed by
    # callers or by Core -- confirm before changing.
    EXTRACTIONS_WITHOUT_UPDATE = [
        Core.Features.BRANCHES,
        Core.Features.ISSUE_BOARDS,
        Core.Features.LABELS,
        Core.Features.MILESTONES,
        Core.Features.PROJECTS,
        Core.Features.RELEASES,
        Core.Features.SNIPPETS,
        Core.Features.USERS,
        Core.Features.WIKIS,
        Core.Features.TRIGGERS
    ]

    # Keys whose dict value is reduced to "<key>_id" when it carries an "id".
    __ID_REFERENCE_KEYS = frozenset({
        "commit", "author", "user", "owner", "assignee", "closed_by",
        "merge_user", "resolved_by", "milestone", "label", "pipeline",
    })

    # Seconds a log consumer waits for a progress message before it re-checks
    # whether the data consumer thread is still alive.
    __LOG_POLL_SECONDS = 0.1

    def __init__(self, data_root_dir: str, project=None, project_namespace=None, project_name=None, extract_parallel=False) -> None:
        """
        Initializes a Extractions object with general information.

        Decide whether to initialize with a project object or with the project
        namespace and name. Extractions can only be started with a project
        object.

        ToDo: log_level=logging.INFO

        Parameters
        ----------
        data_root_dir : str
            A existing top level directory for data extraction.
        project : Project, default=None
            Project object from gitlab.
        project_namespace : str, default=None
            Namespace of the project.
        project_name : str, default=None
            Name of the project.
        extract_parallel: bool, default=False
            Parallel extraction might fail for some GitLab Server because of
            server settings.
        """
        super().__init__(data_root_dir, project, project_namespace, project_name)
        self.extract_parallel = extract_parallel
        self.data_queue = queue.Queue()
        self.log_queue = queue.Queue()
        # These thread objects only make the attributes (and is_alive())
        # available before the first run; start() replaces them with fresh
        # threads because a Thread can be started at most once.
        self.consumer_thread = threading.Thread(target=self.__gitlab_data_consumer)
        self.log_serial_thread = threading.Thread(target=self.__log_serial_consumer)
        self.log_parallel_thread = threading.Thread(target=self.__log_parallel_consumer)
        # None: no filtering, True: feature_list is a whitelist,
        # False: feature_list is a blacklist.
        self.use_feature_whitelist = None
        self.feature_list = []
        self.update_date = None

    def start(self, feature_blacklist: Union[list, None] = None, feature_whitelist: Union[list, None] = None, update: bool = True) -> None:
        """
        Starts a extraction with a blacklist or whitelist for features.

        The extraction can start from the last stored activity date of the
        project or cover the entire project.

        Parameters
        ----------
        feature_blacklist : list, default=None
            Features which will be ignored. None or an empty list disables
            the blacklist.
        feature_whitelist : list, default=None
            Features which will be extracted. If it is None or empty then all
            features are extracted which are not in the blacklist.
        update: bool, default=True
            Extract only new items after last extraction.

        Raises
        ------
        Exception
            If no project object is available or an extraction is already
            running.
        """
        if self.project is None:
            raise Exception("Need a connection (project object)")
        if self.consumer_thread.is_alive():
            raise Exception("Can not extract Data. There is already one extraction running")

        if feature_whitelist:
            self.use_feature_whitelist = True
            self.feature_list = feature_whitelist
            if feature_blacklist:
                print("Whitelist is used and Blacklist is ignored!")
        elif feature_blacklist:
            self.use_feature_whitelist = False
            self.feature_list = feature_blacklist
        else:
            self.use_feature_whitelist = None

        # Always start from a clean state so that a previous update run does
        # not leak its date into a run that was requested with update=False.
        self.update_date = None
        if update:
            # ToDo: Check projects attributes
            projects_df = self.get_pandas_data_frame(self.Features.PROJECTS)
            if projects_df is not None and not projects_df.empty:
                own_path = self.project.attributes["path_with_namespace"]
                project_df = projects_df.loc[projects_df["path_with_namespace"] == own_path]
                if len(project_df) == 1:
                    self.update_date = project_df.iloc[0]["last_activity_at"]
                elif len(project_df) > 1:
                    print("there is redundant information in projects pandas file")

        # Thread objects are single-use, so every run gets new ones.
        self.consumer_thread = threading.Thread(target=self.__gitlab_data_consumer)
        if self.extract_parallel:
            self.log_parallel_thread = threading.Thread(target=self.__log_parallel_consumer)
            log_thread = self.log_parallel_thread
        else:
            self.log_serial_thread = threading.Thread(target=self.__log_serial_consumer)
            log_thread = self.log_serial_thread

        method_list = [
            name for name in dir(Extractions)
            if name.startswith('extract') and callable(getattr(Extractions, name))
        ]

        self.consumer_thread.start()
        log_thread.start()
        try:
            if self.extract_parallel:
                ### parallel ###
                threads = [threading.Thread(target=getattr(self, name), args=()) for name in method_list]
                for thread in threads:
                    thread.start()
                for thread in threads:
                    thread.join()
            else:
                ### sequential ###
                for name in method_list:
                    getattr(self, name)()
        finally:
            # The sentinel must be queued even if an extraction raised,
            # otherwise the consumer (and with it the log thread) would wait
            # forever. Whatever was collected so far is still saved, which
            # matches what already happens when a producer thread dies in
            # parallel mode.
            self.data_queue.put((None, None))
            self.consumer_thread.join()
            log_thread.join()
            self.use_feature_whitelist = None
            self.feature_list = []

    def pass_white_black_list(self, feature) -> bool:
        """
        Checks if a feature passes the white- and blacklist.

        Parameters
        ----------
        feature : str
            Feature to be checked.

        Returns
        -------
        bool
            True if the feature can be extracted. False if the feature should
            be ignored.
        """
        if self.use_feature_whitelist is None:
            return True
        listed = feature in self.feature_list
        # Whitelist: must be listed. Blacklist: must not be listed.
        return listed if self.use_feature_whitelist else not listed

    def __run_feature_extraction(self, feature: str, functions: list, sub_functions: dict) -> None:
        """
        Starts the producer for a feature that may have sub features.

        Parameters
        ----------
        feature : str
            The main feature.
        functions : list
            Attribute chain on the project object which lists the feature.
        sub_functions : dict
            Sub features (already filtered by white-/blacklist) mapped to
            their attribute chains.
        """
        if not self.pass_white_black_list(feature):
            if not self.use_feature_whitelist:
                # If the feature is on the blacklist then ignore the feature and its subfeatures
                return
            if not sub_functions:
                # If the feature is not on the whitelist then ignore the feature only if no subfeature is on the whitelist
                return
        if sub_functions:
            functions = {
                "attr": functions,
                "sub_functions": sub_functions
            }
        self.__gitlab_data_producer(self.project, feature, functions)

    def __notes_sub_functions(self, notes_feature: str, emojis_feature: str) -> Union[dict, list, None]:
        """
        Builds the producer description for notes and their award emojis.

        Parameters
        ----------
        notes_feature : str
            Feature name of the notes.
        emojis_feature : str
            Feature name of the award emojis of the notes.

        Returns
        -------
        dict
            Notes with award emojis as sub feature.
        list
            Notes only.
        None
            Notes are not extracted.
        """
        if self.pass_white_black_list(emojis_feature):
            if self.use_feature_whitelist is not False or notes_feature not in self.feature_list:
                # If there is not a blacklist or notes are not on the blacklist then add notes award emojis.
                return {
                    "attr": ["notes", "list"],
                    "sub_functions": {emojis_feature: ["awardemojis", "list"]}
                }
            return None
        if self.pass_white_black_list(notes_feature):
            return ["notes", "list"]
        return None

    def extract_branches(self) -> None:
        """
        Extracts branches from GitLab.

        Check for update does not work.
        """
        if not self.pass_white_black_list(Core.Features.BRANCHES):
            return
        self.__gitlab_data_producer(self.project, Core.Features.BRANCHES, ["branches", "list"])

    def extract_commits(self) -> None:
        """
        Extracts commits and its sub features from GitLab.

        Check for update works.
        """
        candidates = (
            (Core.Features.COMMITS_COMMENTS, ["comments", "list"]),
            (Core.Features.COMMITS_REFS, ["refs"]),
            (Core.Features.COMMITS_DIFFS, ["diff"]),
            (Core.Features.COMMITS_STATUSES, ["statuses", "list"]),
        )
        sub_functions = {
            feature: attrs for feature, attrs in candidates
            if self.pass_white_black_list(feature)
        }
        self.__run_feature_extraction(Core.Features.COMMITS, ["commits", "list"], sub_functions)

    def extract_events(self) -> None:
        """
        Extracts events from GitLab.

        Check for update works.
        """
        if not self.pass_white_black_list(Core.Features.EVENTS):
            return
        self.__gitlab_data_producer(self.project, Core.Features.EVENTS, ["events", "list"])

    def extract_issues(self) -> None:
        """
        Extracts issues and its sub features from GitLab.

        Check for update works.
        """
        sub_functions = {}
        # ignored ["time_stats"] --> already in issue
        # ignored ["participants"] --> already in issue
        notes = self.__notes_sub_functions(Core.Features.ISSUES_NOTES, Core.Features.ISSUES_NOTES_AWARD_EMOJIS)
        if notes is not None:
            sub_functions[Core.Features.ISSUES_NOTES] = notes
        candidates = (
            (Core.Features.ISSUES_AWARD_EMOJIS, ["awardemojis", "list"]),
            (Core.Features.ISSUES_RESOURCESTATEEVENTS, ["resourcestateevents", "list"]),
            (Core.Features.ISSUES_RESOURCELABELEVENTS, ["resourcelabelevents", "list"]),
            (Core.Features.ISSUES_CLOSED_BY_MR, ["closed_by"]),
            (Core.Features.ISSUES_RELATED_MR, ["related_merge_requests"]),
            (Core.Features.ISSUES_LINKS, ["links", "list"]),
            (Core.Features.ISSUES_RESOURCEMILESTONESEVENTS, ["resourcemilestoneevents", "list"]),
        )
        for feature, attrs in candidates:
            if self.pass_white_black_list(feature):
                sub_functions[feature] = attrs
        self.__run_feature_extraction(Core.Features.ISSUES, ["issues", "list"], sub_functions)

    def extract_issue_boards(self) -> None:
        """
        Extracts issue boards from GitLab.

        Check for update does not work.
        """
        sub_functions = {}
        if self.pass_white_black_list(Core.Features.ISSUE_BOARDS_LISTS):
            sub_functions[Core.Features.ISSUE_BOARDS_LISTS] = ["lists", "list"]
        self.__run_feature_extraction(Core.Features.ISSUE_BOARDS, ["boards", "list"], sub_functions)

    def extract_labels(self) -> None:
        """
        Extracts labels from GitLab.

        Check for update does not work.
        """
        if not self.pass_white_black_list(Core.Features.LABELS):
            return
        self.__gitlab_data_producer(self.project, Core.Features.LABELS, ["labels", "list"])

    def extract_merge_requests(self) -> None:
        """
        Extracts merge requests and its sub features from GitLab.

        Check for update works.
        """
        sub_functions = {}
        # ignore ["pipelines", "list"] --> pipelines can be matched via commit sha
        # ignored ["time_stats"] --> already in mr
        notes = self.__notes_sub_functions(
            Core.Features.MERGE_REQUESTS_NOTES,
            Core.Features.MERGE_REQUESTS_NOTES_AWARD_EMOJIS,
        )
        if notes is not None:
            sub_functions[Core.Features.MERGE_REQUESTS_NOTES] = notes
        candidates = (
            (Core.Features.MERGE_REQUESTS_AWARD_EMOJIS, ["awardemojis", "list"]),
            (Core.Features.MERGE_REQUESTS_COMMITS, ["commits"]),
            (Core.Features.MERGE_REQUESTS_CHANGES, ["changes"]),
            (Core.Features.MERGE_REQUESTS_DIFFS, ["diffs", "list"]),
            (Core.Features.MERGE_REQUESTS_RESOURCESTATEEVENTS, ["resourcestateevents", "list"]),
            (Core.Features.MERGE_REQUESTS_RESOURCELABELEVENTS, ["resourcelabelevents", "list"]),
            (Core.Features.MERGE_REQUESTS_RESOURCEMILESTONESEVENTS, ["resourcemilestoneevents", "list"]),
        )
        for feature, attrs in candidates:
            if self.pass_white_black_list(feature):
                sub_functions[feature] = attrs
        self.__run_feature_extraction(Core.Features.MERGE_REQUESTS, ["mergerequests", "list"], sub_functions)

    def extract_milestones(self) -> None:
        """
        Extracts milestones from GitLab.

        Check for update does not work.
        """
        if not self.pass_white_black_list(Core.Features.MILESTONES):
            return
        # milestone.issues() --> in issues
        # milestone.merge_requests() --> in merge requests
        self.__gitlab_data_producer(self.project, Core.Features.MILESTONES, ["milestones", "list"])

    def extract_pipelines(self) -> None:
        """
        Extracts pipelines and its sub features from GitLab.

        Check for update works. If updated, then it will extract jobs, too
        (as long as jobs pass the white- and blacklist).
        """
        sub_functions = {}
        if self.pass_white_black_list(Core.Features.PIPELINES_REPORT):
            sub_functions[Core.Features.PIPELINES_REPORT] = ["test_report", "get"]
        if self.pass_white_black_list(Core.Features.PIPELINES_BRIDGES):
            sub_functions[Core.Features.PIPELINES_BRIDGES] = ["bridges", "list"]
        # In update mode extract_jobs() does nothing, so jobs are collected per
        # updated pipeline here. The white-/blacklist still has to be honoured.
        if self.update_date is not None and self.pass_white_black_list(Core.Features.JOBS):
            sub_functions[Core.Features.JOBS] = ["jobs", "list"]
        self.__run_feature_extraction(Core.Features.PIPELINES, ["pipelines", "list"], sub_functions)

    def extract_triggers(self) -> None:
        """
        Extracts triggers for pipelines from GitLab.

        Check for update does not work.
        """
        if not self.pass_white_black_list(Core.Features.TRIGGERS):
            return
        self.__gitlab_data_producer(self.project, Core.Features.TRIGGERS, ["triggers", "list"])

    def extract_pipeline_schedules(self) -> None:
        """
        Extracts pipeline schedules for pipelines from GitLab.

        Check for update does not work.
        """
        if not self.pass_white_black_list(Core.Features.PIPELINE_SCHEDULES):
            return
        self.__gitlab_data_producer(self.project, Core.Features.PIPELINE_SCHEDULES, ["pipelineschedules", "list"])

    def extract_jobs(self) -> None:
        """
        Extracts jobs from GitLab.

        Check for update works. If updated, then jobs will be extracted in
        pipelines.
        """
        if self.update_date is not None:
            return
        if not self.pass_white_black_list(Core.Features.JOBS):
            return
        self.__gitlab_data_producer(self.project, Core.Features.JOBS, ["jobs", "list"])

    def extract_project(self) -> None:
        """
        Extracts general project information from GitLab.

        The project row is written directly to the projects pandas file and
        replaces an existing row with the same id. Check for update does not
        work.
        """
        def try_len(function, **kwargs):
            # Number of items returned by ``function`` or None if the token
            # is not allowed to call it.
            try:
                obj = function(**kwargs)
            except GitlabAuthenticationError:
                print(f"Token can not access {function}")
                return None
            else:
                return len(obj)

        if not self.pass_white_black_list(Core.Features.PROJECTS):
            return

        try:
            commits = self.project.commits.list(all=True)
        except GitlabAuthenticationError:
            print("Token can not access self.project.commits.list")
            last_commit_date = None
            commit_count = None
        else:
            commit_count = len(commits)
            # A repository without commits has no last commit date.
            last_commit_date = commits[0].attributes["created_at"] if commit_count else None

        project_data = self.__get_gitlab_attributes(self.project.attributes)
        project_data.update({
            "contributor_count": try_len(self.project.repository_contributors, all=True),
            "member_count": try_len(self.project.members_all.list, all=True),
            "branch_count": try_len(self.project.branches.list, all=True),
            "commit_count": commit_count,
            "last_commit_date": last_commit_date,
            "labels_count": try_len(self.project.labels.list, all=True),
            "milestone_count": try_len(self.project.milestones.list, all=True),
            "merge_requests_count": try_len(self.project.mergerequests.list, all=True),
            "release_count": try_len(self.project.releases.list, all=True),
            "issues_count": try_len(self.project.issues.list, all=True)
        })

        projects_df = self.get_pandas_data_frame(Core.Features.PROJECTS)
        if projects_df is None or projects_df.empty:
            self.save_as_pandas(Core.Features.PROJECTS, pd.DataFrame([project_data]))
            return
        # Replace the previously stored row of this project.
        projects_df = projects_df[projects_df.id != project_data['id']]
        projects_df = pd.concat([projects_df, pd.DataFrame([project_data])], ignore_index=True, sort=False)
        self.save_as_pandas(Core.Features.PROJECTS, projects_df)

    def extract_releases(self) -> None:
        """
        Extracts releases from GitLab.

        Check for update does not work.
        """
        if not self.pass_white_black_list(Core.Features.RELEASES):
            return
        self.__gitlab_data_producer(self.project, Core.Features.RELEASES, ["releases", "list"])

    def extract_snippets(self) -> None:
        """
        Extracts snippets from GitLab.

        Check for update does not work.
        """
        if not self.pass_white_black_list(Core.Features.SNIPPETS):
            return
        self.__gitlab_data_producer(self.project, Core.Features.SNIPPETS, ["snippets", "list"])

    def extract_users(self) -> None:
        """
        Extracts users from GitLab.

        Check for update does not work.
        """
        if not self.pass_white_black_list(Core.Features.USERS):
            return
        self.__gitlab_data_producer(self.project, Core.Features.USERS, ["users", "list"])

    def extract_wikis(self) -> None:
        """
        Extracts wiki pages from GitLab.

        Check for update does not work.
        """
        if not self.pass_white_black_list(Core.Features.WIKIS):
            return
        self.__gitlab_data_producer(self.project, Core.Features.WIKIS, ["wikis", "list"])

    def __log_serial_consumer(self) -> None:
        """
        Method for a consumer thread to log the serial process of extracting data.

        The first message of a feature announces its total, every further
        message of that feature counts one finished item. The thread ends
        once the data consumer thread is finished and the log queue is empty.
        """
        totals = {}
        counts = {}
        size = 60
        max_text_length = 30
        while True:
            try:
                # Block with a timeout instead of spinning on empty().
                feature, total = self.log_queue.get(timeout=self.__LOG_POLL_SECONDS)
            except queue.Empty:
                if self.consumer_thread.is_alive():
                    continue
                break
            if feature not in totals:
                if totals:
                    # Finish the progress line of the previous feature.
                    sys.stdout.write("\n")
                totals[feature] = total
                counts[feature] = 0
            else:
                counts[feature] += 1
            filled = int(size * counts[feature] / total) if total else size
            text = f" extracting {feature}:".ljust(max_text_length)
            sys.stdout.write("%s[%s%s] %i/%i\r" % (text, "#" * filled, "." * (size - filled), counts[feature], total))
            sys.stdout.flush()
        sys.stdout.write("\n")
        sys.stdout.flush()

    def __log_parallel_consumer(self) -> None:
        """
        Method for a consumer thread to log the parallel process of extracting data.

        All features share one progress bar: the first message of a feature
        adds its total, every further message counts one finished item.
        """
        features = set()
        total_items = 0
        finished_items = 0
        size = 60
        while True:
            try:
                # Block with a timeout instead of spinning on empty().
                feature, total = self.log_queue.get(timeout=self.__LOG_POLL_SECONDS)
            except queue.Empty:
                if self.consumer_thread.is_alive():
                    continue
                break
            if feature not in features:
                features.add(feature)
                total_items += total
            else:
                finished_items += 1
            filled = int(size * finished_items / total_items) if total_items else size
            sys.stdout.write("%s[%s%s] %i/%i\r" % (" extracting parallel: ", "#" * filled, "." * (size - filled), finished_items, total_items))
            sys.stdout.flush()
        sys.stdout.write("\n")
        sys.stdout.flush()

    def __gitlab_data_consumer(self) -> None:
        """
        Method for a consumer thread to collect the extracted data.

        Items are read from the data queue until the sentinel ``(None, None)``
        arrives. After everything is extracted the data will be saved to
        pandas files.
        """
        buffer = {}
        while True:
            # Blocking get: no CPU is burned while the producers are waiting
            # for the server.
            feature_name, gitlab_data = self.data_queue.get()
            if feature_name is None:
                break
            buffer.setdefault(feature_name, []).append(gitlab_data)

        if self.update_date is None:
            for key, value in buffer.items():
                self.save_as_pandas(key, pd.DataFrame(value))
            return

        # updated on and after the last day of modification
        # These features are always stored as extracted and never merged with
        # the existing file.
        replaced_features = (
            self.Features.WIKIS,
            self.Features.BRANCHES,
            self.Features.ISSUE_BOARDS,
            self.Features.ISSUE_BOARDS_LISTS,
            self.Features.LABELS,
            self.Features.USERS,
        )
        # ToDo: Check if there are redundant information
        for key, value in buffer.items():
            if key in replaced_features:
                self.save_as_pandas(key, pd.DataFrame(value))
                continue
            feature_df = self.get_pandas_data_frame(key)
            # New rows first, so drop_duplicates (keep="first") prefers them.
            # pd.concat silently drops feature_df when it is None.
            new_df = pd.concat([pd.DataFrame(value), feature_df], ignore_index=True)
            if "iid" in new_df:
                new_df = new_df.drop_duplicates(subset=['iid'])
            elif "id" in new_df:
                new_df = new_df.drop_duplicates(subset=['id'])
            else:
                print(f"Ids not found in {key}. Might not drop duplicates")
                new_df = new_df.drop_duplicates()
            # reset_index returns a new frame; the result has to be kept.
            new_df = new_df.reset_index(drop=True)
            self.save_as_pandas(key, new_df)

    def __gitlab_data_producer(self, gitlab_obj, feature_name:str, value, is_sub_function:bool = False) -> None:
        """
        Method for a producer thread to extract data from GitLab.

        It can run multiple producer loops in order to extract subfeatures.
        After extracting the data is placed in a queue.

        Parameters
        ----------
        gitlab_obj : Any
            A GitLab object which can acquire information. Top level object
            is the GitLab project object.
        feature_name : str
            The name of the feature which will be extracted.
        value : Any
            The value as dict defines if the feature has subfeatures to
            extract. The value as list defines a list of methods for the
            feature in order to extract data.
        is_sub_function : bool
            States if this feature is a subfeature and called by main feature.
        """
        has_sub_functions = isinstance(value, dict)
        attrs = value["attr"] if has_sub_functions else value

        # Resolve the attribute chain, e.g. project.issues.list
        obj = gitlab_obj
        for attr in attrs:
            obj = getattr(obj, attr)

        try:
            if self.update_date is None or self.get_pandas_data_frame_path(feature_name) is None:
                gitlab_data_list = obj(all=True)
            else:
                # The different list endpoints use different parameter names
                # for the date filter, therefore all of them are passed.
                gitlab_data_list = obj(all=True, since=self.update_date, updated_after=self.update_date, after=self.update_date)
        except GitlabAuthenticationError:
            print(f"Token can not access {feature_name}!")
            return

        if isinstance(gitlab_data_list, dict) or hasattr(gitlab_data_list, "attributes"):
            # A single object was returned instead of a list.
            self.__producer_loop(gitlab_data_list, gitlab_obj, feature_name, value, has_sub_functions)
            return

        log_progress = not is_sub_function
        item_count = len(gitlab_data_list)
        if log_progress and item_count > 0:
            # First message announces the total of the feature.
            self.log_queue.put((feature_name, item_count))
        threads = []
        for gitlab_data in gitlab_data_list:
            if has_sub_functions and self.extract_parallel:
                loop_thread = threading.Thread(
                    target=self.__producer_loop,
                    args=(gitlab_data, gitlab_obj, feature_name, value, has_sub_functions,),
                )
                loop_thread.start()
                threads.append(loop_thread)
            else:
                self.__producer_loop(gitlab_data, gitlab_obj, feature_name, value, has_sub_functions)
            if log_progress:
                # Every further message counts one handled item.
                self.log_queue.put((feature_name, item_count))
        for thread in threads:
            thread.join()

    def __producer_loop(self, gitlab_data, gitlab_obj, feature_name:str, value, has_sub_functions:bool) -> None:
        """
        Method for a thread to start a new data producer or to extract the
        data from the attributes.

        Parameters
        ----------
        gitlab_data : Any
            Includes the extracted data of the gitlab object.
        gitlab_obj : Any
            A GitLab object which can acquire information. Top level object
            is the GitLab project object.
        feature_name : str
            The name of the feature which will be extracted.
        value : Any
            The value as dict defines if the feature has subfeatures to
            extract. The value as list defines a list of methods for the
            feature in order to extract data.
        has_sub_functions : bool
            States if this feature has a subfeature and need to be called by
            current feature.
        """
        if feature_name in (Core.Features.ISSUES_CLOSED_BY_MR, Core.Features.ISSUES_RELATED_MR):
            # Only the relation between issue and merge request is stored.
            data = {
                "issue_iid": gitlab_obj.attributes["iid"],
                "mr_iid": gitlab_data["iid"],
                "project_id": gitlab_data["project_id"],
            }
            self.data_queue.put((feature_name, data))
            return
        if feature_name == Core.Features.MERGE_REQUESTS_COMMITS:
            # Only the relation between merge request and commit is stored.
            data = {
                "mr_iid": gitlab_obj.attributes["iid"],
                "commit_id": gitlab_data.attributes["id"],
                "project_id": gitlab_data.attributes["project_id"],
            }
            self.data_queue.put((feature_name, data))
            return

        if isinstance(gitlab_data, dict):
            # Plain dicts get the id of their parent object attached.
            # NOTE(review): a dict whose parent has neither "iid" nor "id" is
            # not stored at all -- confirm that this is intended.
            if "iid" in gitlab_obj.attributes:
                self.__get_gitlab_attributes(gitlab_data, feature_name, gitlab_obj.attributes["iid"])
            elif "id" in gitlab_obj.attributes:
                self.__get_gitlab_attributes(gitlab_data, feature_name, gitlab_obj.attributes["id"])
        else:
            self.__get_gitlab_attributes(gitlab_data.attributes, feature_name, None)

        if has_sub_functions:
            for feature_name2, value2 in value["sub_functions"].items():
                self.__gitlab_data_producer(gitlab_data, feature_name2, value2, True)

    def __get_gitlab_attributes(self, gitlab_data, feature_name:str = None, parent_id = None) -> Union[dict,None]:
        """
        Extract the data from a GitLab feature object and apply special information.

        Nested objects are reduced to their id (``<key>_id``), the namespace
        is flattened to ``namespace_<key>`` columns and every other nested
        dict or list is stored as JSON string.

        Parameters
        ----------
        gitlab_data : Any
            Includes the extracted data of the gitlab object
        feature_name : str, default=None
            The name of the feature which will be extracted. Only if the
            feature is defined then it will place the data in the data_queue
            or otherwise it will return the data.
        parent_id : Any, default=None
            If the feature has a parent then the parent_id will be added to
            the data.

        Returns
        -------
        dict
            The extracted data will be returned because no feature is defined.
        None
            The extracted data will be passed in the data_queue because a
            feature is defined.
        """
        data = {}
        if parent_id is not None:
            if feature_name is not None and "MRs" in feature_name:
                data["mr_iid"] = parent_id
            elif feature_name is not None and "Commits" in feature_name:
                data["commit_id"] = parent_id
            else:
                data["parent_id"] = parent_id
                print(f"{feature_name} has a unknown parent id!")

        for key, value in gitlab_data.items():
            if isinstance(value, dict):
                has_id = "id" in value
                if has_id and key in self.__ID_REFERENCE_KEYS:
                    data[f"{key}_id"] = value["id"]
                elif has_id and key == "merged_by":
                    # deprecated --> merge_user
                    continue
                elif key == "namespace":
                    for key2, value2 in value.items():
                        data[f"{key}_{key2}"] = value2
                else:
                    data[key] = json.dumps(value)
            elif isinstance(value, list):
                if key == "labels":
                    data[key] = value
                elif key == "assignees":
                    data["assignees_ids"] = [assignee["id"] for assignee in value]
                elif key == "reviewers":
                    data["reviewers_ids"] = [reviewer["id"] for reviewer in value]
                elif key in ("parent_ids", "topics"):
                    data[key] = list(value)
                elif key == "tag_list":
                    # deprecated --> topics
                    continue
                else:
                    data[key] = json.dumps(value)
            else:
                data[key] = value

        if feature_name is None:
            return data
        self.data_queue.put((feature_name, data))