Skip to content

Commit 1fd77b7

Browse files
authored
Merge pull request #144 from KingAkeem/multi_thread_tree
Using multi-threading for tree generation with links and adding documentation
2 parents 65bee14 + a9d945d commit 1fd77b7

6 files changed

Lines changed: 169 additions & 97 deletions

File tree

modules/analyzer.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from ete3 import Tree, TreeStyle, TextFace, add_face_to_node
77
from .link import LinkNode
8+
from .utils import multi_thread
89

910
class LinkTree:
1011
"""
@@ -17,8 +18,8 @@ class LinkTree:
1718
tld (bool): Decides whether or not to use additional top-level-domains besides .tor
1819
stop_depth (int): Depth of which to stop searching for links
1920
"""
20-
def __init__(self, root_node, *, tld=False, stop_depth=1):
21-
self._tree = build_tree(root_node, tld=tld, stop=stop_depth)
21+
def __init__(self, root_node, *, stop_depth=1):
22+
self._tree = build_tree(root_node, stop=stop_depth)
2223

2324
def __len__(self):
2425
return len(self._tree)
@@ -66,10 +67,11 @@ def initialize_tree(root_node):
6667
to_visit (list): Children of root node
6768
"""
6869
root = Tree(name=root_node.name)
69-
children = root_node.get_children()
70+
children = root_node.links
7071
return root, children
7172

72-
def build_tree(link, *, tld, stop=1, rec=0, to_visit=None, tree=None):
73+
74+
def build_tree(link=None, *, stop=1, rec=0, to_visit=None, tree=None):
7375
"""
7476
Builds tree using Breadth First Search. You can specify stop depth.
7577
Rec & tree arguments are used for recursion.
@@ -97,25 +99,32 @@ def build_tree(link, *, tld, stop=1, rec=0, to_visit=None, tree=None):
9799
# If recursion is 0 then sub_tree will be root
98100
return sub_tree if rec == 0 else tree
99101

100-
children_to_visit = list()
101-
for link_name in to_visit:
102+
def visit_nodes(link):
103+
children_to_visit = list()
102104
try:
103-
node = LinkNode(link_name, tld=tld)
105+
node = LinkNode(link)
104106
except (ValueError, ConnectionError, HTTPError):
105-
continue
107+
return None
106108

107109
link_node = sub_tree.add_child(name=node.name)
108-
link_children = node.get_children()
110+
link_children = node.links
109111
# No need to find children if we aren't going to visit them
110112
if stop != rec + 1:
111113
for child in link_children:
112114
link_node.add_child(name=child)
113115
children_to_visit.append(child)
116+
117+
if stop != rec + 1:
118+
return children_to_visit
119+
120+
return to_visit
121+
122+
next_nodes = multi_thread(to_visit, visit_nodes)
114123
rec += 1
115124

116125
# If we've reached stop depth then return tree
117126
if stop == rec:
118127
return sub_tree
119128

120129
new_tree = tree.add_child(sub_tree)
121-
return build_tree(to_visit, tld=tld, stop=stop, rec=rec, tree=new_tree)
130+
return build_tree(to_visit=next_nodes, stop=stop, rec=rec, tree=new_tree)

modules/link.py

Lines changed: 85 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,75 @@
1-
import re
1+
"""
22
3+
This module is used to create a LinkNode that can be consumed by a LinkTree
4+
and contains useful Link methods
5+
6+
"""
37
import requests
48
import requests.exceptions
59
import validators
610

711
from bs4 import BeautifulSoup
12+
from .utils import multi_thread
813
from .color import color
914

15+
def get_emails(node):
    """Finds all emails associated with node

    Scans every anchor in ``node.children`` for an ``href`` containing
    'mailto' and collects the address part that passes
    ``LinkNode.valid_email``.

    Args:
        node (LinkNode): node used to get emails from
    Returns:
        emails (list): list of emails
    """
    emails = []
    for child in node.children:
        link = child.get('href')
        if not link or 'mailto' not in link:
            continue

        email_addr = link.split(':')
        # Check that an address part exists BEFORE indexing it; an href
        # that mentions 'mailto' but has no ':' (e.g. '/mailto-form')
        # would otherwise raise IndexError.
        if len(email_addr) > 1 and LinkNode.valid_email(email_addr[1]):
            emails.append(email_addr[1])
    return emails
31+
32+
33+
def get_links(node):
    """Finds all links associated with node

    Each anchor in ``node.children`` is handed to ``multi_thread`` together
    with a helper that yields the anchor's ``href`` when it passes
    ``LinkNode.valid_link`` and ``None`` otherwise.

    Args:
        node (LinkNode): node used to get links from
    Returns:
        links (list): result of ``multi_thread`` over the anchors;
            presumably the list of valid links -- confirm against
            ``multi_thread`` in utils.
    """
    def retrieve_link(child):
        # Yield the href only when it is present and well-formed
        href = child.get('href')
        is_valid = href and LinkNode.valid_link(href)
        return href if is_valid else None

    anchors = node.children
    return multi_thread(anchors, retrieve_link)
48+
49+
1050
class LinkNode:
51+
"""Represents link node in a link tree
52+
53+
Attributes:
54+
link (str): link to be used as node
55+
"""
1156

12-
def __init__(self, link, *, tld=False):
57+
def __init__(self, link):
58+
# If link has invalid form, throw an error
1359
if not self.valid_link(link):
1460
raise ValueError("Invalid link format.")
1561

16-
self.tld = tld
1762
self._children = []
1863
self._emails = []
64+
self._links = []
1965

66+
# Attempts to connect to link, throws an error if link is unreachable
2067
try:
2168
self.response = requests.get(link)
22-
except (requests.exceptions.ChunkedEncodingError, requests.exceptions.HTTPError, requests.exceptions.ConnectionError, ConnectionError) as err:
69+
except (requests.exceptions.ChunkedEncodingError,
70+
requests.exceptions.HTTPError,
71+
requests.exceptions.ConnectionError,
72+
ConnectionError) as err:
2373
raise err
2474

2575
self._node = BeautifulSoup(self.response.text, 'html.parser')
@@ -30,43 +80,43 @@ def __init__(self, link, *, tld=False):
3080
self.name = self._node.title.string
3181
self.status = color(link, 'green')
3282

33-
def get_emails(self):
34-
if self._emails:
35-
return self._emails
36-
37-
children = self._node.find_all('a')
38-
email_nodes = []
39-
for child in children:
40-
link = child.get('href')
41-
if link and 'mailto' in link:
42-
email_addr = link.split(':')
43-
if self.valid_email(email_addr[1]) and len(email_addr) > 1:
44-
email_nodes.append(email_addr[1])
45-
self._emails = email_nodes
46-
return email_nodes
47-
48-
def get_children(self):
49-
if self._children:
50-
return self._children
51-
52-
children = self._node.find_all('a')
53-
child_nodes = []
54-
for child in children:
55-
link = child.get('href')
56-
if link and self.valid_link(link):
57-
child_nodes.append(link)
58-
59-
self._children = child_nodes
60-
return child_nodes
83+
@property
84+
def emails(self):
85+
"""
86+
Getter for node emails
87+
"""
88+
if not self._emails:
89+
self._emails = get_emails(self)
90+
return self._emails
91+
92+
@property
93+
def links(self):
94+
"""
95+
Getter for node links
96+
"""
97+
if not self._links:
98+
self._links = get_links(self)
99+
return self._links
100+
101+
@property
102+
def children(self):
103+
"""
104+
Getter for node children
105+
"""
106+
if not self._children:
107+
self._children = self._node.find_all('a')
108+
return self._children
61109

62110
@staticmethod
63111
def valid_email(email):
64-
if validators.email(email):
65-
return True
66-
return False
112+
"""Static method used to validate emails"""
113+
if validators.email(email):
114+
return True
115+
return False
67116

68117
@staticmethod
69118
def valid_link(link):
119+
"""Static method used to validate links"""
70120
if validators.url(link):
71121
return True
72122
return False

modules/link_io.py

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,44 +9,56 @@
99
from .color import color
1010

1111
class LinkIO:
12-
12+
"""
13+
This class is only used to interact with links
14+
"""
1315
@staticmethod
14-
def display_children(link_node):
15-
children = link_node.get_children()
16-
sucess_msg = color(f'Links Found - {len(children)}', 'green')
16+
def display_children(root):
17+
"""
18+
Static method to display status of child nodes
19+
20+
Args:
21+
root (LinkNode): root of children to be displayed
22+
"""
23+
sucess_msg = color(f'Links Found - {len(root.links)}', 'green')
1724
print(sucess_msg + '\n' + '---------------------------------')
18-
multi_thread(children, LinkIO.display)
25+
multi_thread(root.links, LinkIO.display)
1926

2027
@staticmethod
2128
def read(link, *, response=False, show_msg=False, headers=None, schemes=None):
2229
"""
2330
Attempts to retrieve HTML from link
2431
2532
Args:
26-
headers (dict)
27-
schemes (list)
33+
link (str): link to read
34+
response (bool): determines if response is returned.
35+
show_msg(bool): determines if message is displayed for connection
36+
headers (dict): header for request, defaults to None
37+
schemes (list): different schemes to attempt to use
2838
Returns:
29-
resp.text (str): html from page
39+
str: html from page
40+
requests.Response (optional): response returned from requests
41+
3042
"""
3143
headers = {'User-Agent': 'XXXX-XXXXX-XXXX'} if not headers else headers
3244
# Attempts to connect directly to site if no scheme is passed
3345
if not schemes:
3446
if show_msg:
3547
print(f'Attempting to connect to {link}')
3648
if LinkNode.valid_link(link):
37-
node = LinkNode(link, tld=True)
49+
node = LinkNode(link)
3850
if response:
3951
return node.response.text, node.response
4052
return node.response.text
4153

4254
schemes = ['https://', 'http://'] if not schemes else schemes
43-
55+
# Attempt to use different schemes until one is successful
4456
for scheme in schemes:
4557
temp_url = scheme + link
4658
if show_msg:
4759
print(f'Attempting to connect to {link}')
4860
if LinkNode.valid_link(temp_url):
49-
node = LinkNode(temp_url, tld=True)
61+
node = LinkNode(temp_url)
5062
if response:
5163
return node.response.text, node.response
5264
return node.response.text
@@ -55,26 +67,31 @@ def read(link, *, response=False, show_msg=False, headers=None, schemes=None):
5567
@staticmethod
5668
def display(link):
5769
"""
58-
Prints the status of a link
70+
Prints the status of a link based on its connection status
71+
72+
Args:
73+
link (str): link to get status of
5974
"""
6075
if LinkNode.valid_link(link):
6176
try:
62-
node = LinkNode(link, tld=True)
77+
node = LinkNode(link)
6378
title = node.name
6479
link_status = node.status
65-
except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError, ConnectionError):
80+
except (requests.exceptions.HTTPError,
81+
requests.exceptions.ConnectionError,
82+
ConnectionError):
6683
title = 'Not Found'
6784
link_status = color(link, 'red')
6885

69-
print("%-80s %-30s" % (link_status, title))
86+
status_msg = "%-80s %-30s" % (link_status, title)
87+
print(status_msg)
7088

7189

7290
@staticmethod
7391
def display_ip():
74-
"""Returns users tor ip address
75-
92+
"""
7693
https://check.torproject.org/ tells you if you are using tor and it
77-
displays your IP address which we scape and return
94+
displays your IP address which we scrape and display
7895
"""
7996

8097
page = LinkIO.read('https://check.torproject.org/', show_msg=True)

modules/tests/test_getweblinks.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import pytest
55
import requests_mock
66

7-
from bs4 import BeautifulSoup
87
from yattag import Doc
98
from ..link import LinkNode
109

@@ -24,7 +23,7 @@ def setup_html(test_links, *, fail=False):
2423
with tag('body'):
2524
for data in test_links:
2625
if not fail:
27-
line('a', 'test_anchor', href=data)
26+
line('a', 'test_anchor', href=data)
2827

2928
return doc.getvalue()
3029

@@ -45,7 +44,7 @@ def test_get_links_fail():
4544
mock_connection.register_uri('GET', data, text=mock_html)
4645
with pytest.raises(ValueError):
4746
node = LinkNode(data)
48-
result = node.get_children()
47+
result = node.links
4948
assert result == []
5049

5150
@pytest.fixture
@@ -61,11 +60,10 @@ def test_get_links_tor():
6160
mock_html = setup_html(test_data)
6261
mock_link = 'http://test.tor'
6362
with requests_mock.Mocker() as mock_connection:
64-
for data in test_data:
65-
mock_connection.register_uri('GET', mock_link, text=mock_html)
63+
mock_connection.register_uri('GET', mock_link, text=mock_html)
6664

6765
node = LinkNode(mock_link)
68-
result = node.get_children()
66+
result = node.links
6967
assert result == test_data
7068

7169

@@ -93,7 +91,7 @@ def test_get_links_tld():
9391
mock_connection.register_uri('GET', mock_url, text=mock_html)
9492

9593
node = LinkNode(mock_url)
96-
links = node.get_children()
94+
links = node.links
9795
assert links == test_data
9896

9997

0 commit comments

Comments
 (0)