Python Projects: Site emails from a specified URL

Python Web Project-8 with Solution

Create a Python project to get the site emails from a specified URL.

Sample Output:

url -> https://github.com
2 emails found:
[email protected]
[email protected]

Sample Solution:

Python Code:

"""Get the site emails from URL."""
__author__ = "Muhammad Umer Farooq"
__license__ = "MIT"
__version__ = "1.0.0"
__maintainer__ = "Muhammad Umer Farooq"
__email__ = "[email protected]"
__status__ = "Alpha"

import re
from html.parser import HTMLParser
from urllib import parse

import requests
class Parser(HTMLParser):
    def __init__(self, domain: str):
        self.data = []
        self.domain = domain

    def handle_starttag(self, tag: str, attrs: str) -> None:
        This function parse html to take takes url from tags
        # Only parse the 'anchor' tag.
        if tag == "a":
            # Check the list of defined attributes.
            for name, value in attrs:
                # If href is defined, and not empty nor # print it.
                if name == "href" and value != "#" and value != "":
                    # If not already in data.
                    if value not in self.data:
                        url = parse.urljoin(self.domain, value)

# Get main domain name (example.com)
def get_domain_name(url: str) -> str:
    This function get the main domain name
    >>> get_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
    >>> get_domain_name("Not a URL!")
    return ".".join(get_sub_domain_name(url).split(".")[-2:])
# Get sub domain name (sub.example.com)
def get_sub_domain_name(url: str) -> str:
    >>> get_sub_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
    >>> get_sub_domain_name("Not a URL!")
    return parse.urlparse(url).netloc
def emails_from_url(url: str = "https://github.com") -> list:
    This function takes url and return all valid urls
    # Get the base domain from the url
    domain = get_domain_name(url)

    # Initialize the parser
    parser = Parser(domain)

        # Open URL
        r = requests.get(url)

        # pass the raw HTML to the parser to get links

        # Get links and loop through
        valid_emails = set()
        for link in parser.data:
            # open URL.
            # read = requests.get(link)
                read = requests.get(link)
                # Get the valid email.
                emails = re.findall("[a-zA-Z0-9][email protected]" + domain, read.text)
                # If not in list then append it.
                for email in emails:
            except ValueError:
    except ValueError:

    # Finally return a sorted list of email addresses with no duplicates.
    return sorted(valid_emails)
if __name__ == "__main__":
    emails = emails_from_url("https://github.com")
    print(f"{len(emails)} emails found:")


Flowchart: Site emails from a specified URL.
Flowchart: Site emails from a specified URL.


Improve this sample solutions and post your code through Disqus

Follow us on Facebook and Twitter for latest update.

We are closing our Disqus commenting system for some maintenanace issues. You may write to us at reach[at]yahoo[dot]com or visit us at Facebook