GetQuestionsList¶

Get the list of problems and their information

from leetscrape import GetQuestionsList

ls = GetQuestionsList() # Uses the default limit of 10,000 questions
ls.scrape() # Scrape the questions, topic tags, categories, and company tags
ls.questions.head() # Preview the first rows of the scraped questions table

	QID	title	titleSlug	difficulty	acceptanceRate	paidOnly	topicTags	categorySlug
0	1	Two Sum	two-sum	Easy	51.4225	False	array,hash-table	algorithms
1	2	Add Two Numbers	add-two-numbers	Medium	41.9051	False	linked-list,math,recursion	algorithms
2	3	Longest Substring Without Repeating Characters	longest-substring-without-repeating-characters	Medium	34.3169	False	hash-table,string,sliding-window	algorithms
3	4	Median of Two Sorted Arrays	median-of-two-sorted-arrays	Hard	38.8566	False	array,binary-search,divide-and-conquer	algorithms
4	5	Longest Palindromic Substring	longest-palindromic-substring	Medium	33.4383	False	string,dynamic-programming	algorithms

You can export the associated tables to a directory using the to_csv method:

ls.to_csv(directory="<dir>")

This generates 6 .csv files in the specified directory:
- questions.csv - List of questions with their title, difficulty, acceptance rate, paid status, topic tags, and category.
- companies.csv - List of companies with their name, slug, and the questions count.
- topicTags.csv - List of topic tags with their name and slug.
- categories.csv - List of categories with their name and slug.
- questionCategory.csv - An edgelist of questions and their categories.
- questionTopics.csv - An edgelist of questions and their topic tags.

A class to scrape the list of questions, their topic tags, and company tags.

Parameters:

Name	Type	Description	Default
`limit`	`int`	The maximum number of questions to query for from Leetcode’s graphql API. Defaults to 10,000.	`10000`

Source code in src/leetscrape/questions_list.py

class GetQuestionsList:
    """A class to scrape the list of questions, their topic tags, and company tags.

    After calling `scrape()`, the results are available as pandas dataframes on
    the attributes `companies`, `questions`, `questionTopics`, `categories`,
    `topicTags`, and `questionCategory`.

    Args:
        limit (int, optional): The maximum number of questions to query for from Leetcode's graphql API. Defaults to 10,000.
    """

    def __init__(self, limit: int = 10_000):
        self.limit = limit

    def scrape(self):
        """Run every scraping step, in order, and populate the dataframes.

        The order matters: the question list must exist before the topic edge
        list is extracted, and the categories must be loaded before the
        per-category queries and the final join onto `questions`.
        """
        pipeline = (
            self._scrape_companies,
            self._scrape_questions_list,
            self._extract_question_topics,
            self._get_categories_and_topicTags_lists,
            self._scrape_question_category,
            self._add_category_to_questions_list,
        )
        for step in pipeline:
            step()

    def to_csv(self, directory: str) -> None:
        """Export the scraped data into csv files in preparation for injection
        into a database.

        Six files are written: companies.csv, questions.csv, questionTopics.csv,
        categories.csv, topicTags.csv, and questionCategory.csv. As a side
        effect, the `QID` column of `questions` is converted to int.

        Args:
            directory (str): The directory path to export the scraped data into.
                A trailing path separator is optional; an empty string writes
                to the current working directory.
        """
        # Local import so this method does not depend on the module's import block.
        import os

        def target(filename: str) -> str:
            # Join instead of concatenating so "out" and "out/" behave the same;
            # plain concatenation would write "outcompanies.csv" next to "out".
            return os.path.join(directory, filename)

        self.companies.to_csv(target("companies.csv"), index=False)
        self.questions["QID"] = self.questions["QID"].astype(int)
        self.questions.to_csv(target("questions.csv"), index=False)
        # The edge lists keep their index as an explicit "id" column.
        self.questionTopics.to_csv(
            target("questionTopics.csv"), index=True, index_label="id"
        )
        self.categories.to_csv(target("categories.csv"), index=False)
        self.topicTags.to_csv(target("topicTags.csv"), index=False)
        self.questionCategory.to_csv(
            target("questionCategory.csv"), index=True, index_label="id"
        )

    def _scrape_companies(self):
        """Scrape the company tags and store them in the 'companies' dataframe.

        NOTE: company tags are a paid only feature, so this is expected to
        always produce an empty dataframe.
        """
        print("Scraping companies ... ", end="")
        data = {
            "query": """query questionCompanyTags {
                    companyTags {
                        name
                        slug
                        questionCount
                    }
                }
            """,
            "variables": {},
        }
        r = requests.post("https://leetcode.com/graphql", json=data).json()
        self.companies = pd.json_normalize(r["data"]["companyTags"])
        print("Done")

    def _scrape_questions_list(self):
        """Scrape the list of questions from leetcode.com and store it in the
        'questions' dataframe.

        The columns are QID, title, titleSlug, difficulty, acceptanceRate,
        paidOnly (whether the question is available only to Leetcode's paying
        customers), and topicTags (a list of tag slugs per question).
        """
        print("Scraping questions list ... ", end="")
        data = {
            "query": """query problemsetQuestionList($categorySlug: String, $limit: Int, $skip: Int, $filters: QuestionListFilterInput) {
                    problemsetQuestionList: questionList(
                        categorySlug: $categorySlug
                        limit: $limit
                        skip: $skip
                        filters: $filters
                    ) {
                        total: totalNum
                        questions: data {
                            acceptanceRate: acRate
                            difficulty
                            QID: questionFrontendId
                            paidOnly: isPaidOnly
                            title
                            titleSlug
                            topicTags {
                                slug
                            }
                        }
                    }
                }
            """,
            "variables": {
                "categorySlug": "",
                "skip": 0,
                "limit": self.limit,
                "filters": {},
            },
        }

        r = requests.post("https://leetcode.com/graphql", json=data).json()
        # Select the columns explicitly to fix their order in the dataframe.
        self.questions = pd.json_normalize(
            r["data"]["problemsetQuestionList"]["questions"]
        )[
            [
                "QID",
                "title",
                "titleSlug",
                "difficulty",
                "acceptanceRate",
                "paidOnly",
                "topicTags",
            ]
        ]
        # Each topicTags cell arrives as a list of {"slug": ...} records;
        # flatten it to a plain list of slugs.
        self.questions["topicTags"] = self.questions["topicTags"].apply(
            lambda w: [tag["slug"] for tag in w]
        )
        print("Done")

    def _extract_question_topics(self):
        """Create the 'questionTopics' edge list of questions and topic tags.

        Each row pairs one QID with one tagSlug; questions without any tag are
        dropped.
        """
        print("Extracting question topics ... ", end="")
        self.questionTopics = (
            self.questions[["QID", "topicTags"]]
            .rename(columns={"topicTags": "tagSlug"})
            .explode("tagSlug", ignore_index=True)
        ).dropna()
        print("Done")

    def _get_categories_and_topicTags_lists(self):
        """Load the categories and topic tags of LeetCode problems into the
        'categories' and 'topicTags' attributes respectively.

        Both tables are built from the module-level CATEGORIES and TOPIC_TAGS
        records rather than from a network request.
        """
        print("Getting Categories ... ", end="")
        # List of problem categories
        self.categories = pd.DataFrame.from_records(CATEGORIES)
        print("Done")
        # List of problem topic tags
        print("Scraping Topic Tags ... ", end="")
        self.topicTags = pd.DataFrame.from_records(TOPIC_TAGS)
        print("Done")

    def _scrape_question_category(self):
        """Scrape the category of each question and store it in the
        'questionCategory' dataframe.

        One query is issued per category slug in 'categories'; the resulting
        QIDs are labelled with that slug and concatenated into one edge list.
        """
        print("Extracting question category ... ", end="")
        categories_data = []
        for category in self.categories["slug"].values:
            data = {
                "query": """query problemsetQuestionList($categorySlug: String, $limit: Int, $skip: Int, $filters: QuestionListFilterInput) {
                        problemsetQuestionList: questionList(
                            categorySlug: $categorySlug
                            limit: $limit
                            skip: $skip
                            filters: $filters
                        ) {
                            questions: data {
                                QID: questionFrontendId
                            }
                        }
                    }
                """,
                "variables": {
                    "categorySlug": category,
                    "skip": 0,
                    "limit": self.limit,
                    "filters": {},
                },
            }

            r = requests.post("https://leetcode.com/graphql", json=data).json()
            categories = pd.json_normalize(
                r["data"]["problemsetQuestionList"]["questions"]
            )
            categories["categorySlug"] = category
            categories_data.append(categories)
        self.questionCategory = pd.concat(categories_data, axis=0, ignore_index=True)
        print("Done")

    def _add_category_to_questions_list(self):
        """Finalize the 'questions' dataframe for export.

        Converts the `topicTags` column from a list of slugs into a
        comma-separated string, then joins the `categorySlug` column from
        'questionCategory' onto the questions by `QID`.
        """
        self.questions["topicTags"] = self.questions["topicTags"].apply(
            lambda w: ",".join(w)
        )
        self.questions = self.questions.join(
            self.questionCategory.set_index("QID"), on="QID"
        )

`scrape()` ¶

Scrapes LeetCode data including company tags, questions, question topics, and categories.

Source code in src/leetscrape/questions_list.py

def scrape(self):
    """Run every scraping step, in order: company tags, questions, question
    topics, categories and topic tags, question categories, and finally the
    category join onto the questions list.
    """
    pipeline = (
        self._scrape_companies,
        self._scrape_questions_list,
        self._extract_question_topics,
        self._get_categories_and_topicTags_lists,
        self._scrape_question_category,
        self._add_category_to_questions_list,
    )
    for step in pipeline:
        step()

`to_csv(directory)` ¶

A method to export the scraped data into csv files in preparation for injection into a database.

Parameters:

Name	Type	Description	Default
`directory`	`str`	The directory path to export the scraped data into.	required

Source code in src/leetscrape/questions_list.py

def to_csv(self, directory: str) -> None:
    """Export the scraped data into csv files in preparation for injection
    into a database.

    Six files are written: companies.csv, questions.csv, questionTopics.csv,
    categories.csv, topicTags.csv, and questionCategory.csv. As a side
    effect, the `QID` column of `questions` is converted to int.

    Args:
        directory (str): The directory path to export the scraped data into.
            A trailing path separator is optional; an empty string writes
            to the current working directory.
    """
    # Local import so this method does not depend on the module's import block.
    import os

    def target(filename: str) -> str:
        # Join instead of concatenating so "out" and "out/" behave the same;
        # plain concatenation would write "outcompanies.csv" next to "out".
        return os.path.join(directory, filename)

    self.companies.to_csv(target("companies.csv"), index=False)
    self.questions["QID"] = self.questions["QID"].astype(int)
    self.questions.to_csv(target("questions.csv"), index=False)
    # The edge lists keep their index as an explicit "id" column.
    self.questionTopics.to_csv(
        target("questionTopics.csv"), index=True, index_label="id"
    )
    self.categories.to_csv(target("categories.csv"), index=False)
    self.topicTags.to_csv(target("topicTags.csv"), index=False)
    self.questionCategory.to_csv(
        target("questionCategory.csv"), index=True, index_label="id"
    )

GetQuestionsList¶

scrape() ¶

to_csv(directory) ¶

`scrape()` ¶

`to_csv(directory)` ¶