Source code for gitlab2pandas.core

from pathlib import Path
import pickle
import pandas as pd
from typing import Union

from gitlab.v4.objects import Project
from gitlab import Gitlab

class Core:
    """
    Holds general project information and stores/loads extracted DataFrames.

    Initialize either with a project object or with the project namespace and name.
    Extractions can only be done with a project object or after connecting to a
    server (see ``connect``) with the project namespace and name.

    Parameters
    ----------
    data_root_dir : str
        Top level directory for data extraction. The project directory is
        created below it (missing parent directories are created as well).
    project : Project, default=None
        Project object from gitlab.
    project_namespace : str, default=None
        Namespace of the project.
    project_name : str, default=None
        Name of the project.

    Raises
    ------
    ValueError
        If neither a project nor both its namespace and name are given.

    """

    class Features():
        """Names of all extractable features; each value is used as file name and Excel sheet name."""
        # Name <= 31 chars (the values are passed as sheet names in convert_to_excel,
        # and Excel limits sheet names to 31 characters)
        USERS = "Users"
        BRANCHES = "Branches"
        RELEASES = "Releases"
        PIPELINES = "Pipelines"
        PIPELINES_REPORT = "PipelinesReport"
        PIPELINES_BRIDGES = "PipelinesBridges"
        JOBS = "Jobs"
        ISSUES = "Issues"
        ISSUES_NOTES = "IssuesNotes"
        ISSUES_AWARD_EMOJIS = "IssuesAwardEmojis"
        ISSUES_NOTES_AWARD_EMOJIS = "IssuesNotesAwardEmojis"
        ISSUES_RESOURCESTATEEVENTS = "IssuesResourcestateevents"
        ISSUES_RESOURCELABELEVENTS = "IssuesResourcelabelevents"
        ISSUES_RESOURCEMILESTONESEVENTS = "IssuesResourcemilestonesevents"
        ISSUES_CLOSED_BY_MR = "IssuesClosedByMR"
        ISSUES_RELATED_MR = "IssuesRelatedMR"
        ISSUES_LINKS = "IssuesLinks"
        MERGE_REQUESTS = "MergeRequests"
        MERGE_REQUESTS_NOTES = "MRsNotes"
        MERGE_REQUESTS_COMMITS = "MRsCommits"
        MERGE_REQUESTS_AWARD_EMOJIS = "MRsAwardEmojis"
        MERGE_REQUESTS_NOTES_AWARD_EMOJIS = "MRsNotesAwardEmojis"
        MERGE_REQUESTS_RESOURCESTATEEVENTS = "MRsResourcestateevents"
        MERGE_REQUESTS_RESOURCELABELEVENTS = "MRsResourcelabelevents"
        MERGE_REQUESTS_CHANGES = "MRsChanges"
        MERGE_REQUESTS_DIFFS = "MRsDiffs"
        MERGE_REQUESTS_RESOURCEMILESTONESEVENTS = "MRsResourcemilestonesevents"
        COMMITS = "Commits"
        COMMITS_COMMENTS = "CommitsComments"
        COMMITS_REFS = "CommitsRefs"
        COMMITS_DIFFS = "CommitsDiffs"
        COMMITS_STATUSES = "CommitStatuses"
        PROJECTS = "Projects"
        EVENTS = "Events"
        ISSUE_BOARDS = "IssueBoards"
        ISSUE_BOARDS_LISTS = "IssueBoardsLists"
        LABELS = "Labels"
        TRIGGERS = "Triggers"
        PIPELINE_SCHEDULES = "PipelineSchedules"
        RUNNERS = "Runners"
        RUNNERS_JOBS = "RunnersJobs"
        SNIPPETS = "Snippets"
        WIKIS = "Wikis"
        MILESTONES = "Milestones"

        @classmethod
        def to_list(cls) -> list:
            """
            Returns a list of strings with all Features.

            Returns
            -------
            list
                A list of strings with all Features, in definition order.

            """
            # Dunder entries such as __module__ / __doc__ are strings too and must be skipped.
            return [
                value
                for name, value in vars(cls).items()
                if isinstance(value, str) and not name.startswith("__")
            ]

    class FileTypes():
        """File endings of the supported storage formats."""
        PANDAS = ".p"
        JSON = ".json"

    def __init__(self, data_root_dir: str, project: Union["Project", None] = None,
                 project_namespace: Union[str, None] = None,
                 project_name: Union[str, None] = None) -> None:
        """
        Initializes core object with general information.
        Decide whether to initialize with a project object or with the project namespace and name.
        Extractions can only be done with a project object.

        Parameters
        ----------
        data_root_dir : str
            Top level directory for data extraction.
        project : Project, default=None
            Project object from gitlab.
        project_namespace : str, default=None
            Namespace of the project.
        project_name : str, default=None
            Name of the project.

        Raises
        ------
        ValueError
            If neither a project nor both its namespace and name are given.

        """
        # ToDo: log_level=logging.INFO
        if project is None and (project_namespace is None or project_name is None):
            raise ValueError("Need a project or its namespace and name")
        self.data_root_dir = data_root_dir
        self.project = project
        if project is None:
            self.project_namespace = project_namespace
            self.project_name = project_name
            self.project_data_dir = Path(data_root_dir, project_namespace, project_name)
        else:
            # A project object is authoritative: its attributes override the passed names.
            self.project_namespace = project.attributes["namespace"]["path"]
            self.project_name = project.attributes["path"]
            self.project_data_dir = Path(data_root_dir, project.attributes["path_with_namespace"])
        self.project_data_dir.mkdir(parents=True, exist_ok=True)
        self.input_file_type = self.FileTypes.PANDAS
        self.output_file_type = self.FileTypes.PANDAS

    def _feature_path(self, filename: str, file_type: str) -> Path:
        """Build the file path of a feature; project metadata lives in the top level directory."""
        if filename == Core.Features.PROJECTS:
            base_dir = self.data_root_dir
        else:
            base_dir = self.project_data_dir
        return Path(base_dir, filename + file_type)

    @staticmethod
    def _is_supported_file_type(file_type: str) -> bool:
        """Tell whether the file ending is one of the formats listed in FileTypes."""
        return file_type in (Core.FileTypes.PANDAS, Core.FileTypes.JSON)

    def connect(self, server_url: str, private_token: Union[str, None] = None,
                oauth_token: Union[str, None] = None, job_token: Union[str, None] = None) -> None:
        """
        Get the project object from GitLab using the project namespace and name.
        Only public projects can be accessed (read-only) without a token.
        Extraction can be done after a connection.

        If several tokens are passed, the first one set is used in the order
        private_token, oauth_token, job_token.

        Parameters
        ----------
        server_url : str
            Url to the GitLab server.
        private_token : str, default=None
            Private token or personal token for authentication.
        oauth_token : str, default=None
            Oauth token for authentication.
        job_token : str, default=None
            Job token for authentication (to be used in CI).

        """
        if private_token:
            gitlab_object = Gitlab(server_url, private_token=private_token, per_page=100)
        elif oauth_token:
            gitlab_object = Gitlab(server_url, oauth_token=oauth_token, per_page=100)
        elif job_token:
            gitlab_object = Gitlab(server_url, job_token=job_token, per_page=100)
        else:
            # anonymous gitlab instance, read-only for public resources
            gitlab_object = Gitlab(server_url)
        self.project = gitlab_object.projects.get(f"{self.project_namespace}/{self.project_name}", per_page=100)

    def save_as_pandas(self, filename: str, data: pd.DataFrame) -> None:
        """
        Saves a pandas DataFrame to the project directory in the current output file type.
        The project metadata will be saved in the top level directory.

        Parameters
        ----------
        filename : str
            Name for the file (without file ending).
        data : pd.DataFrame
            DataFrame to be saved.

        """
        pd_file = self._feature_path(filename, self.output_file_type)
        if self.output_file_type == self.FileTypes.PANDAS:
            with open(pd_file, "wb") as f:
                pickle.dump(data, f)
        elif self.output_file_type == self.FileTypes.JSON:
            data.to_json(pd_file, indent=2)

    def get_pandas_data_frame(self, filename: str) -> Union[pd.DataFrame, None]:
        """
        Get a pandas DataFrame from the project directory using the current input file type.
        The project metadata will be accessed from the top level directory.

        Parameters
        ----------
        filename : str
            Name of the file to import (without file ending).

        Returns
        -------
        DataFrame
            Return a DataFrame of the existing file.
        None
            Return None because the file does not exist.

        """
        pd_file = self._feature_path(filename, self.input_file_type)
        if not pd_file.is_file():
            return None
        if self.input_file_type == self.FileTypes.PANDAS:
            # NOTE(security): unpickling executes arbitrary code; only load files from a trusted data directory.
            return pd.read_pickle(pd_file)
        if self.input_file_type == self.FileTypes.JSON:
            return pd.read_json(pd_file)
        return None

    def get_pandas_data_frame_path(self, filename: str) -> Union[Path, None]:
        """
        Get the path of a stored feature file from the project directory.
        The project metadata will be accessed from the top level directory.

        Parameters
        ----------
        filename : str
            Name of the feature to get the file path.

        Returns
        -------
        Path
            Return the path of the feature file.
        None
            Return None because the file does not exist.

        """
        pd_file = self._feature_path(filename, self.input_file_type)
        return pd_file if pd_file.is_file() else None

    def set_input_type(self, input_file_type: str) -> bool:
        """
        Set the input file type and check if the file type is supported by gitlab2pandas.
        Input file type is needed for the update feature.

        Parameters
        ----------
        input_file_type : str
            File ending of the desired input type.

        Returns
        -------
        bool
            Return if the input file type was changed.
        """
        if not self._is_supported_file_type(input_file_type):
            return False
        self.input_file_type = input_file_type
        return True

    def set_output_type(self, output_file_type: str) -> bool:
        """
        Set the output file type and check if the file type is supported by gitlab2pandas.
        Output file type is needed for the automatic dataframe storage of the extractions.

        Parameters
        ----------
        output_file_type : str
            File ending of the desired output type.

        Returns
        -------
        bool
            Return if the output file type was changed.
        """
        if not self._is_supported_file_type(output_file_type):
            return False
        self.output_file_type = output_file_type
        return True

    def convert_to_excel(self, excel_filename: str, features: Union[list, None] = None) -> None:
        """
        Converts features to an excel file in the project directory.
        If no features are passed, then all features will be converted.
        Features without a stored file are skipped.

        Parameters
        ----------
        excel_filename : str
            Name for the file (without the ``.xlsx`` ending).
        features : list, default=None
            Features to convert. If no features are passed, then all features will be converted.

        """
        if features is None:
            features = Core.Features.to_list()
        excel_file = Path(self.project_data_dir, f'{excel_filename}.xlsx')
        # The context manager closes the writer even if writing a sheet fails.
        with pd.ExcelWriter(excel_file, engine='xlsxwriter') as writer:
            for feature in features:
                df = self.get_pandas_data_frame(feature)
                if df is None:
                    continue
                col_times = [
                    col for col in df.columns
                    if any(isinstance(x, pd.Timestamp) for x in df[col])
                ]
                for col in col_times:
                    # Excel cannot store timezone-aware datetimes, so the timezone is dropped.
                    df[col] = pd.to_datetime(df[col]).dt.tz_localize(None)
                df.to_excel(writer, sheet_name=feature)