Browse Source

parsing in python

Nikolaos Alexopoulos 7 years ago
parent
commit
475d9a5d08

BIN
.apt-sec.swp


BIN
.debian-security-advisory.pl.swp


BIN
.debian-security-advisory.py.swp


BIN
.testperl.pl.swp


+ 75 - 9
debian-security-advisory.py

@@ -9,11 +9,18 @@
 
 import re
 import datetime
+from html.parser import HTMLParser
+from bs4 import BeautifulSoup
 import urllib.request
 import logging, sys
 
 logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
 
+#Testing global variables
+config = dict([('dsa_base_url','https://www.debian.org/security/')])
+state = dict([('next_adv',3700)])
+dsatable = dict()
+
 ## Fetch DSA from debian archive. Can't use tracker since dates are missing.
 ## DSA started counting in November 2000. We'll simply brute-force which DSA
 ## was in which year and start in 2000 until current.
@@ -64,14 +71,19 @@ def fetchDSA(dsa_id, base_url):
         try:
             flag = False
             logging.info('Opening url: ' + base_url + str(year) + '/dsa-' + dsa_id2string + '\n')
-            dsa = urllib.request.urlopen(base_url + str(year) + '/dsa-' + dsa_id2string).read()
+            req = urllib.request.urlopen(base_url + str(year) + '/dsa-' + dsa_id2string)
+            charset = req.info().get_content_charset()
+            if charset is None:
+                charset = 'utf-8'
+            dsa = req.read().decode(charset)
             return dsa 
         except urllib.error.HTTPError as err:
             if year < current_year:
                 year += 1
                 flag = True
             else:
-                pass
+                dsa = ''
+                return dsa
 ###############################################################################
 
 ## Try to find new DSAs by iteration, return table of DSAs to process
@@ -81,15 +93,69 @@ def checkDSAs(state, config):
 
     logging.info('Checking for new DSAs.. \n')
 
-    if next_dsa < config['first_dsa']:
-         logging.debug('Cache was deleted, starting at DSA ' + str(next_dsa) + '\n')
-         next_dsa = config['first_dsa']
+#    if next_dsa < config['first_dsa']:
+#         logging.debug('Cache was deleted, starting at DSA ' + str(next_dsa) + '\n')
+#         next_dsa = config['first_dsa']
+
+#    if  blacklistedDSA('DSA-' + str(next_dsa)):
+#        next_dsa += 1
 
-    if  blacklistedDSA('DSA-' + str(next_dsa)):
+    dsa = fetchDSA(next_dsa, config['dsa_base_url'])
+    
+    while dsa != '':
+        logging.debug('Got DSA-' + str(next_dsa) + '\n')
+        soup = BeautifulSoup(dsa,'html.parser')
+        #crop the DSA from unnecessary weight
+        dsa = soup.find(id="content")
+        if dsa == '':
+            raise NameError('html file format unexpected')
+        dsatable[next_dsa] = str(dsa)
         next_dsa += 1
+#        if  blacklistedDSA('DSA-' + str(next_dsa)):
+#            next_dsa += 1
+        dsa = fetchDSA(next_dsa, config['dsa_base_url'])
+
+    state['next_dsa'] = next_dsa
+    return dsatable
+
+###############################################################################
+
+
+## Parse DSA html data and return array
+## (src-pkg-name date (CVE-id)*)
+
+def parseDSAhtml(dsa):
+    
+    # Date Reported -> dsa_date
+    soup = BeautifulSoup(dsa, 'html.parser')
+    tmp = soup.find("dt",string=re.compile(".*Date Repo.*:"))
+    tmp = str(tmp.find_next().contents[0])
+    dsa_date = tmp.split()
+    if dsa_date == []:
+        print('Unable to extract date. Returning...')
+        raise NameError('file format problem')
+
+    # Affected Packages -> dsa_names
+    #print(dsa)
+    tmp = soup.find("dt",string=re.compile("Affected Packages:"))
+    tmp = tmp.find_next().contents
+
+    for i in tmp:
+        print(i)
+    
+#    m = re.search('<dt>Affected\ Packages:<\/dt>(.*)<\/dd>.*<dt>Vulnerable:',dsa)
+#    if m:
+#        print(len(m.group))
+#        tmpstring = m.group(1)
+#    else:
+#        print('Unable to extract affected packages. Returning...')
+#        raise NameError('html file format unexpected')
+
 
-    dsa = fetchDSA(next_dsa, config['dsa_base_url']
-    while dsa 
 
 
-fetchDSA(3200,'https://www.debian.org/security/')
+#dsa = fetchDSA(3200,'https://www.debian.org/security/')
+dsatable = checkDSAs(state,config)
+#print(dsatable[3701])
+parseDSAhtml(dsatable[3701])
+#checkDSAs(state,config)

+ 5 - 0
testperl.pl

@@ -0,0 +1,5 @@
+#!/usr/bin/perl
+
+my $s = "info info . info <div id=\"inner\"> \n blah blah blah";
+$s =~ s/.*<div\ id="inner">//si;
+print $s