diff --git a/handelsregister.py b/handelsregister.py
index 03ccc1a..137ce9b 100755
--- a/handelsregister.py
+++ b/handelsregister.py
@@ -12,6 +12,7 @@
import sys
from bs4 import BeautifulSoup
import urllib.parse
+import xml.etree.ElementTree as ET
# Dictionaries to map arguments to values
schlagwortOptionen = {
@@ -20,6 +21,24 @@
"exact": 3
}
+# XJustiz namespace: prefix map handed to every ElementTree find()/findall()
+# call in this module so that paths can be written as 'tns:<element>'.
+NS = {'tns': 'http://www.xjustiz.de'}
+
+# Role code to label mapping (from xjustiz codeliste:gds.rollenbezeichnung).
+# parse_si_detail() looks a role code up here first; codes that are not listed
+# fall back to the label taken from the XML comment map, then to the raw code.
+ROLE_CODES = {
+ '086': 'Geschäftsführer(in)',
+ '087': 'Vorstand',
+ '285': 'Prokurist(in)',
+ '287': 'Rechtsträger(in)',
+ '288': 'Registergericht',
+ '215': 'Einreicher(in)',
+ '061': 'Gesellschafter(in)',
+ '062': 'Kommanditist(in)',
+ '063': 'Persönlich haftende(r) Gesellschafter(in)',
+ '089': 'Liquidator(in)',
+ '297': 'Inhaber(in)',
+}
+
class HandelsRegister:
def __init__(self, args):
self.args = args
@@ -49,7 +68,7 @@ def __init__(self, args):
),
( "Connection", "keep-alive" ),
]
-
+
self.cachedir = pathlib.Path(tempfile.gettempdir()) / "handelsregister_cache"
self.cachedir.mkdir(parents=True, exist_ok=True)
@@ -96,8 +115,294 @@ def search_company(self):
# TODO catch the situation if there's more than one company?
# TODO get all documents attached to the exact company
# TODO parse useful information out of the PDFs
+ self._last_search_html = html
return get_companies_in_searchresults(html)
+    def fetch_company_detail(self, result_index=0):
+        """Fetch and parse the SI (Strukturierter Registerinhalt) document of a search result.
+
+        Must be called after search_company(), which stores the result page in
+        ``self._last_search_html``. The SI link of the requested row carries its
+        PrimeFaces submit parameters in its ``onclick`` attribute; they are
+        injected as hidden controls into ``ergebnissForm`` and the form is
+        submitted to download the XML document. The downloaded XML is also
+        written to the cache directory.
+
+        Args:
+            result_index: Zero-based row index of the search result.
+
+        Returns:
+            The dict produced by parse_si_detail(), or None when no search page
+            is available or the row has no usable SI link.
+        """
+        # The attribute only exists once search_company() has stored it;
+        # treat a missing page like "SI unavailable" instead of crashing.
+        html = getattr(self, '_last_search_html', None)
+        if not html:
+            return None
+        soup = BeautifulSoup(html, 'html.parser')
+
+        # Link ids follow the pattern
+        # ergebnissForm:selectedSuchErgebnisFormTable:{index}:...:fade_
+        row_marker = ':{}:'.format(result_index)
+        si_link = None
+        for a in soup.find_all('a'):
+            if row_marker not in a.get('id', ''):
+                continue
+            span = a.find('span')
+            if span is not None and span.text.strip() == 'SI':
+                si_link = a
+                break
+
+        if si_link is None:
+            return None
+
+        onclick = si_link.get('onclick', '')
+        if not onclick:
+            return None
+
+        # Parse the 'key':'value' pairs passed to PrimeFaces.addSubmitParam
+        pairs = re.findall(r"'([^']+)':'([^']*)'", onclick)
+        if not pairs:
+            return None
+
+        # Select the ergebnissForm and inject the pairs as hidden params
+        self.browser.select_form(name='ergebnissForm')
+        for key, value in pairs:
+            self.browser.form.new_control('hidden', key, {'value': value})
+        self.browser.form.fixup()
+
+        response_si = self.browser.submit()
+        si_xml = response_si.read().decode('utf-8')
+
+        # Cache the SI XML. The text was decoded as UTF-8, so write it back as
+        # UTF-8 rather than with the platform default encoding.
+        cachename = self.companyname2cachename(self.args.schlagwoerter + '_SI')
+        with open(cachename, 'w', encoding='utf-8') as f:
+            f.write(si_xml)
+
+        return parse_si_detail(si_xml)
+
+
+def _build_comment_map(xml_str):
+    """Map coded values to the human-readable label in the XML comment before them.
+
+    XJustiz uses XML comments to provide human-readable labels for coded
+    values; the comment sits directly in front of the ``<code>`` element, e.g.::
+
+        <!-- Gesellschaft mit beschränkter Haftung --><code>221110</code>
+
+    ElementTree drops comments during parsing, so the pairs are extracted from
+    the raw string instead.
+
+    Args:
+        xml_str: The raw XML document as text.
+
+    Returns:
+        Dict mapping each code value to its label, both stripped of
+        surrounding whitespace. When a code occurs several times the last
+        label wins. Pairs with an empty label or code are skipped.
+    """
+    # (?:(?!-->).)+? keeps the label inside one comment, so a comment that is
+    # not followed by <code> cannot swallow a later comment/code pair.
+    pattern = r'<!--((?:(?!-->).)+?)-->\s*<code>([^<]+)</code>'
+    comment_map = {}
+    for match in re.finditer(pattern, xml_str, re.DOTALL):
+        label = match.group(1).strip()
+        # Strip the code as well: lookups use values returned by _text(),
+        # which are stripped.
+        code = match.group(2).strip()
+        if label and code:
+            comment_map[code] = label
+    return comment_map
+
+
+def parse_si_detail(xml_str):
+ """Parse the SI (Strukturierter Registerinhalt) XJustiz XML into a dict.
+
+ The SI document is an XJustiz XML file containing structured register content:
+ company name, address, legal form, capital, directors, prokura holders, etc.
+
+ Args:
+ xml_str: The raw XML document as text. It is parsed with ElementTree and
+ additionally scanned as a string for comment labels.
+
+ Returns:
+ A dict that may contain the keys name, legal_form, legal_form_code, seat,
+ address, articles_of_association_date, business_purpose,
+ representation_rules, directors, prokura, capital, register_type,
+ register_number, retrieval_date, last_entry_date, num_entries and
+ register_entries. Most keys are only added when the matching element is
+ present; values read through _text() are None when the element is
+ missing or empty.
+
+ Raises:
+ xml.etree.ElementTree.ParseError: If xml_str is not well-formed XML.
+ """
+ root = ET.fromstring(xml_str)
+ comment_map = _build_comment_map(xml_str)
+ detail = {}
+
+ # Build a role-number-to-person/org mapping from beteiligung entries
+ roles = {} # rollennummer -> {role_code, role_label, person_data}
+ for beteiligung in root.findall('.//tns:beteiligung', NS):
+ rolle_elems = beteiligung.findall('tns:rolle', NS)
+ beteiligter = beteiligung.find('tns:beteiligter', NS)
+ if beteiligter is None:
+ continue
+
+ # One beteiligter may carry several rolle elements; each rollennummer
+ # gets its own entry, and a repeated rollennummer overwrites the earlier one.
+ for rolle in rolle_elems:
+ rollennummer = _text(rolle, 'tns:rollennummer')
+ role_code_elem = rolle.find('tns:rollenbezeichnung', NS)
+ role_code = _text(role_code_elem, 'code') if role_code_elem is not None else None
+ # Label lookup order: static ROLE_CODES, then XML comment label, then the raw code.
+ role_label = ROLE_CODES.get(role_code, comment_map.get(role_code, role_code))
+
+ person_data = _parse_beteiligter(beteiligter, comment_map)
+ if rollennummer:
+ roles[rollennummer] = {
+ 'role_code': role_code,
+ 'role_label': role_label,
+ **person_data
+ }
+
+ # Company info from the Rechtsträger (role code 287)
+ # Only the first role with that code is used.
+ for rnum, rdata in roles.items():
+ if rdata.get('role_code') == '287':
+ detail['name'] = rdata.get('name')
+ detail['legal_form'] = rdata.get('legal_form')
+ detail['legal_form_code'] = rdata.get('legal_form_code')
+ detail['seat'] = rdata.get('seat')
+ if rdata.get('address'):
+ detail['address'] = rdata['address']
+ break
+
+ # basisdatenRegister
+ basisdaten = root.find('.//tns:basisdatenRegister', NS)
+ if basisdaten is not None:
+ # Satzungsdatum
+ satzung = basisdaten.find('.//tns:aktuellesSatzungsdatum', NS)
+ if satzung is not None and satzung.text:
+ detail['articles_of_association_date'] = satzung.text
+
+ # Gegenstand (business purpose)
+ gegenstand = basisdaten.find('.//tns:gegenstand', NS)
+ if gegenstand is not None and gegenstand.text:
+ detail['business_purpose'] = gegenstand.text.strip()
+
+ # Vertretungsregelung (representation rules)
+ # Only stored when the code has a label in the comment map.
+ allg_vertretung = basisdaten.find('.//tns:allgemeineVertretungsregelung', NS)
+ if allg_vertretung is not None:
+ vb = allg_vertretung.find('.//tns:vertretungsbefugnis', NS)
+ if vb is not None:
+ vb_code = _text(vb, 'code')
+ if vb_code and vb_code in comment_map:
+ detail['representation_rules'] = comment_map[vb_code]
+
+ # Representatives with their specific rules
+ # rep is the same dict object stored in roles, so the keys added here
+ # show up in the directors/prokura lists collected further down.
+ for vb_elem in basisdaten.findall('.//tns:vertretungsberechtigte', NS):
+ ref = _text(vb_elem, 'tns:ref.rollennummer')
+ if ref and ref in roles:
+ rep = roles[ref]
+ besondere = vb_elem.find('tns:besondereVertretungsregelung', NS)
+ if besondere is not None:
+ # Free text wins over the coded rule; a trailing ';' is removed.
+ freitext = besondere.find('.//tns:vertretungsbefugnisFreitext', NS)
+ if freitext is not None and freitext.text:
+ rep['representation'] = freitext.text.strip().rstrip(';')
+ else:
+ vb_code_elem = besondere.find('.//tns:vertretungsbefugnis', NS)
+ if vb_code_elem is not None:
+ code_val = _text(vb_code_elem, 'code')
+ if code_val and code_val in comment_map:
+ rep['representation'] = comment_map[code_val]
+
+ # The mere presence of the element sets the flag; its content is not read.
+ befreiung = besondere.find('.//tns:befreiungVon181BGB', NS)
+ if befreiung is not None:
+ rep['exempt_181_bgb'] = True
+
+ # Collect managing directors and prokura holders
+ directors = []
+ prokura = []
+ for rnum, rdata in roles.items():
+ if rdata.get('role_code') == '086': # Geschäftsführer
+ directors.append(rdata)
+ elif rdata.get('role_code') == '087': # Vorstand
+ directors.append(rdata)
+ elif rdata.get('role_code') == '285': # Prokurist
+ prokura.append(rdata)
+
+ if directors:
+ detail['directors'] = directors
+ if prokura:
+ detail['prokura'] = prokura
+
+ # Capital (Stammkapital / Grundkapital)
+ # grundkapital is only consulted when no stammkapital element exists.
+ stammkapital = root.find('.//tns:stammkapital', NS)
+ if stammkapital is None:
+ stammkapital = root.find('.//tns:grundkapital', NS)
+ if stammkapital is not None:
+ zahl = _text(stammkapital, 'tns:zahl')
+ waehrung_elem = stammkapital.find('.//tns:waehrung', NS)
+ waehrung_code = _text(waehrung_elem, 'code') if waehrung_elem is not None else None
+ if zahl:
+ # The amount stays a string; the currency defaults to 'EUR' when no code is given.
+ detail['capital'] = {
+ 'amount': zahl,
+ 'currency': waehrung_code or 'EUR'
+ }
+
+ # Register info
+ aktenzeichen = root.find('.//tns:aktenzeichen.strukturiert', NS)
+ if aktenzeichen is not None:
+ register = aktenzeichen.find('tns:register', NS)
+ nummer = _text(aktenzeichen, 'tns:laufendeNummer')
+ if register is not None and nummer:
+ reg_code = _text(register, 'code')
+ detail['register_type'] = reg_code
+ detail['register_number'] = nummer
+
+ # Auszug metadata
+ # The three keys are always set when an auszug element exists; each value
+ # is None if the corresponding child is missing or empty.
+ auszug = root.find('.//tns:auszug', NS)
+ if auszug is not None:
+ detail['retrieval_date'] = _text(auszug, 'tns:abrufdatum')
+ detail['last_entry_date'] = _text(auszug, 'tns:letzteEintragung')
+ detail['num_entries'] = _text(auszug, 'tns:anzahlEintragungen')
+
+ # Eintragungstexte (register entry texts)
+ # 'type' is only added when the eintragungsart code has a comment label.
+ entries = []
+ for et_elem in root.findall('.//tns:eintragungstext', NS):
+ entry = {}
+ entry['column'] = _text(et_elem, 'tns:spalte')
+ entry['position'] = _text(et_elem, 'tns:position')
+ entry['number'] = _text(et_elem, 'tns:laufendeNummer')
+ entry['text'] = _text(et_elem, 'tns:text')
+ art_elem = et_elem.find('tns:eintragungsart', NS)
+ if art_elem is not None:
+ art_code = _text(art_elem, 'code')
+ if art_code and art_code in comment_map:
+ entry['type'] = comment_map[art_code]
+ entries.append(entry)
+ if entries:
+ detail['register_entries'] = entries
+
+ return detail
+
+
+def _text(parent, path):
+    """Return the stripped text of the first element matching *path*.
+
+    None is returned when *parent* is None, when nothing matches, or when the
+    matched element has no text.
+    """
+    node = parent.find(path, NS) if parent is not None else None
+    # Compare with None explicitly: an Element without children is falsy.
+    if node is None or not node.text:
+        return None
+    return node.text.strip()
+
+
+def _parse_beteiligter(beteiligter, comment_map):
+    """Turn one ``beteiligter`` element into a flat person/organisation dict.
+
+    A natural person takes precedence: when a ``natuerlichePerson`` descendant
+    exists, only person fields (name, date_of_birth, city) are returned.
+    Otherwise an ``organisation`` descendant yields name, legal form, seat and
+    address. With neither present the result is an empty dict.
+    """
+    natural = beteiligter.find('.//tns:natuerlichePerson', NS)
+    if natural is not None:
+        result = {}
+        first = _text(natural, './/tns:vorname')
+        last = _text(natural, './/tns:nachname')
+        if last:
+            # A first name without a surname is not used as a name.
+            result['name'] = '%s %s' % (first, last) if first else last
+
+        born = _text(natural, './/tns:geburtsdatum')
+        if born:
+            result['date_of_birth'] = born
+
+        home = natural.find('tns:anschrift', NS)
+        if home is not None:
+            result['city'] = _text(home, 'tns:ort')
+
+        return result
+
+    result = {}
+    organisation = beteiligter.find('.//tns:organisation', NS)
+    if organisation is None:
+        return result
+
+    result['name'] = _text(organisation, './/tns:bezeichnung.aktuell')
+
+    # _text() copes with a missing rechtsform element by returning None.
+    form_code = _text(organisation.find('.//tns:rechtsform', NS), 'code')
+    if form_code:
+        result['legal_form'] = comment_map.get(form_code, form_code)
+        result['legal_form_code'] = form_code
+
+    seat_elem = organisation.find('tns:sitz', NS)
+    if seat_elem is not None:
+        result['seat'] = _text(seat_elem, 'tns:ort')
+
+    postal = organisation.find('tns:anschrift', NS)
+    if postal is not None:
+        street = _text(postal, 'tns:strasse')
+        house_no = _text(postal, 'tns:hausnummer')
+        if street and house_no:
+            street = street + ' ' + house_no
+        fields = (
+            ('street', street),
+            ('postal_code', _text(postal, 'tns:postleitzahl')),
+            ('city', _text(postal, 'tns:ort')),
+        )
+        # Keep only the parts that are actually present.
+        address = {key: value for key, value in fields if value}
+        if address:
+            result['address'] = address
+
+    return result
def parse_result(result):
@@ -106,7 +411,7 @@ def parse_result(result):
cells.append(cell.text.strip())
d = {}
d['court'] = cells[1]
-
+
# Extract register number: HRB, HRA, VR, GnR followed by numbers (e.g. HRB 12345, VR 6789)
# Also capture suffix letter if present (e.g. HRB 12345 B), but avoid matching start of words (e.g. " Formerly")
reg_match = re.search(r'(HRA|HRB|GnR|VR|PR)\s*\d+(\s+[A-Z])?(?!\w)', d['court'])
@@ -147,10 +452,65 @@ def pr_company_info(c):
for name, loc in c.get('history'):
print(name, loc)
+def pr_company_detail(detail):
+    """Print the SI detail fields in human-readable format.
+
+    Args:
+        detail: The dict returned by parse_si_detail(), or a falsy value when
+            no detail could be fetched.
+
+    Fields whose value is None are treated like missing fields, so the text
+    "None" is never printed: scalar fields are skipped, names fall back to
+    '?', entry type/text to an empty string and the two dates to '-'.
+    """
+    if not detail:
+        print(' (no detail data available)')
+        return
+
+    print()
+    print('--- Detail (SI) ---')
+    for key in ('name', 'legal_form', 'seat', 'business_purpose',
+                'articles_of_association_date', 'representation_rules'):
+        value = detail.get(key)
+        # parse_si_detail() may store None for name/legal_form/seat.
+        if value is not None:
+            label = key.replace('_', ' ').title()
+            print('%s: %s' % (label, value))
+
+    if 'address' in detail:
+        addr = detail['address']
+        parts = []
+        if 'street' in addr:
+            parts.append(addr['street'])
+        if 'postal_code' in addr and 'city' in addr:
+            parts.append('%s %s' % (addr['postal_code'], addr['city']))
+        elif 'city' in addr:
+            parts.append(addr['city'])
+        print('Address: %s' % ', '.join(parts))
+
+    if 'capital' in detail:
+        cap = detail['capital']
+        print('Capital: %s %s' % (cap['amount'], cap['currency']))
+
+    # Directors and prokura holders share one output format.
+    for heading, key in (('Directors:', 'directors'), ('Prokura:', 'prokura')):
+        if key not in detail:
+            continue
+        print(heading)
+        for person in detail[key]:
+            extra = ''
+            if person.get('representation'):
+                extra = ' (%s)' % person['representation']
+            print(' %s%s' % (person.get('name') or '?', extra))
+
+    if 'register_entries' in detail:
+        print('Register Entries:')
+        for entry in detail['register_entries']:
+            # `or` instead of a .get() default: the keys may exist with value None.
+            etype = entry.get('type') or ''
+            text = entry.get('text') or ''
+            print(' [%s] %s' % (etype, text))
+
+    print('Last Entry: %s' % (detail.get('last_entry_date') or '-'))
+    print('Retrieval Date: %s' % (detail.get('retrieval_date') or '-'))
+
def get_companies_in_searchresults(html):
soup = BeautifulSoup(html, 'html.parser')
grid = soup.find('table', role='grid')
-
+
results = []
for result in grid.find_all('tr'):
a = result.get('data-ri')
@@ -195,6 +555,12 @@ def parse_args():
help="Return response as JSON",
action="store_true"
)
+ parser.add_argument(
+ "-det",
+ "--detail",
+ help="Fetch SI (structured register content) detail for the first search result",
+ action="store_true"
+ )
args = parser.parse_args()
@@ -214,8 +580,17 @@ def parse_args():
h.open_startpage()
companies = h.search_company()
if companies is not None:
+ detail = None
+ if args.detail and len(companies) > 0:
+ detail = h.fetch_company_detail(result_index=0)
+
if args.json:
- print(json.dumps(companies))
+ output = companies
+ if detail:
+ output[0]['detail'] = detail
+ print(json.dumps(output))
else:
for c in companies:
pr_company_info(c)
+ if detail:
+ pr_company_detail(detail)
diff --git a/test_handelsregister.py b/test_handelsregister.py
index fa1951a..9b6a717 100644
--- a/test_handelsregister.py
+++ b/test_handelsregister.py
@@ -1,12 +1,12 @@
import pytest
-from handelsregister import get_companies_in_searchresults,HandelsRegister
+from handelsregister import get_companies_in_searchresults,HandelsRegister,parse_si_detail
import argparse
def test_parse_search_result():
html = '
%s' % """"""
res = get_companies_in_searchresults(html)
assert res == [{
- 'court':'Berlin District court Berlin (Charlottenburg) HRB 44343',
+ 'court':'Berlin District court Berlin (Charlottenburg) HRB 44343',
'register_num': 'HRB 44343 B',
'name':'GASAG AG',
'state':'Berlin',
@@ -17,6 +17,75 @@ def test_parse_search_result():
},]
+def test_parse_si_detail():
+    """Test parsing of SI (Strukturierter Registerinhalt) XJustiz XML."""
+    # Minimal XJustiz-shaped fixture: it contains only the elements that
+    # parse_si_detail() queries, not a complete register document. Labels for
+    # coded values sit in the XML comment directly in front of <code>.
+    si_xml = """<tns:nachricht.reg.0400003 xmlns:tns="http://www.xjustiz.de">
+  <tns:grunddaten>
+    <tns:aktenzeichen.strukturiert>
+      <tns:register><!-- Handelsregister B --><code>HRB</code></tns:register>
+      <tns:laufendeNummer>32007</tns:laufendeNummer>
+    </tns:aktenzeichen.strukturiert>
+    <tns:beteiligung>
+      <tns:rolle>
+        <tns:rollennummer>1</tns:rollennummer>
+        <tns:rollenbezeichnung><!-- Rechtsträger(in) --><code>287</code></tns:rollenbezeichnung>
+      </tns:rolle>
+      <tns:beteiligter>
+        <tns:organisation>
+          <tns:bezeichnung.aktuell>Test Company GmbH</tns:bezeichnung.aktuell>
+          <tns:rechtsform><!-- Gesellschaft mit beschränkter Haftung (GmbH) --><code>221110</code></tns:rechtsform>
+          <tns:sitz><tns:ort>Leipzig</tns:ort></tns:sitz>
+          <tns:anschrift>
+            <tns:strasse>Teststraße</tns:strasse>
+            <tns:hausnummer>42</tns:hausnummer>
+            <tns:postleitzahl>04109</tns:postleitzahl>
+            <tns:ort>Leipzig</tns:ort>
+          </tns:anschrift>
+        </tns:organisation>
+      </tns:beteiligter>
+    </tns:beteiligung>
+    <tns:beteiligung>
+      <tns:rolle>
+        <tns:rollennummer>4</tns:rollennummer>
+        <tns:rollenbezeichnung><!-- Geschäftsführer(in) --><code>086</code></tns:rollenbezeichnung>
+      </tns:rolle>
+      <tns:beteiligter>
+        <tns:natuerlichePerson>
+          <tns:vorname>Max</tns:vorname>
+          <tns:nachname>Mustermann</tns:nachname>
+          <tns:geburtsdatum>1984-06-25</tns:geburtsdatum>
+          <tns:anschrift><tns:ort>Leipzig</tns:ort></tns:anschrift>
+        </tns:natuerlichePerson>
+      </tns:beteiligter>
+    </tns:beteiligung>
+    <tns:beteiligung>
+      <tns:rolle>
+        <tns:rollennummer>5</tns:rollennummer>
+        <tns:rollenbezeichnung><!-- Prokurist(in) --><code>285</code></tns:rollenbezeichnung>
+      </tns:rolle>
+      <tns:beteiligter>
+        <tns:natuerlichePerson>
+          <tns:vorname>Erika</tns:vorname>
+          <tns:nachname>Musterfrau</tns:nachname>
+          <tns:geburtsdatum>1990-01-15</tns:geburtsdatum>
+          <tns:anschrift><tns:ort>Berlin</tns:ort></tns:anschrift>
+        </tns:natuerlichePerson>
+      </tns:beteiligter>
+    </tns:beteiligung>
+  </tns:grunddaten>
+  <tns:fachdatenRegister>
+    <tns:basisdatenRegister>
+      <tns:aktuellesSatzungsdatum>2015-06-18</tns:aktuellesSatzungsdatum>
+      <tns:gegenstand>Softwareentwicklung und IT-Beratung.</tns:gegenstand>
+      <tns:vertretungsberechtigte>
+        <tns:ref.rollennummer>4</tns:ref.rollennummer>
+        <tns:besondereVertretungsregelung>
+          <tns:vertretungsbefugnisFreitext>einzelvertretungsberechtigt;</tns:vertretungsbefugnisFreitext>
+        </tns:besondereVertretungsregelung>
+      </tns:vertretungsberechtigte>
+    </tns:basisdatenRegister>
+    <tns:stammkapital>
+      <tns:zahl>25000.00</tns:zahl>
+      <tns:waehrung><!-- Euro --><code>EUR</code></tns:waehrung>
+    </tns:stammkapital>
+    <tns:auszug>
+      <tns:abrufdatum>2026-02-25</tns:abrufdatum>
+      <tns:letzteEintragung>2025-06-02</tns:letzteEintragung>
+      <tns:anzahlEintragungen>7</tns:anzahlEintragungen>
+    </tns:auszug>
+    <tns:eintragungstext>
+      <tns:spalte>6</tns:spalte>
+      <tns:position>1</tns:position>
+      <tns:laufendeNummer>1</tns:laufendeNummer>
+      <tns:text>Gesellschaftsvertrag vom 18.06.2015.</tns:text>
+    </tns:eintragungstext>
+  </tns:fachdatenRegister>
+</tns:nachricht.reg.0400003>"""
+
+    detail = parse_si_detail(si_xml)
+
+    # Company info
+    assert detail['name'] == 'Test Company GmbH'
+    assert 'GmbH' in detail['legal_form']
+    assert detail['seat'] == 'Leipzig'
+
+    # Address
+    assert detail['address']['street'] == 'Teststraße 42'
+    assert detail['address']['postal_code'] == '04109'
+    assert detail['address']['city'] == 'Leipzig'
+
+    # Business purpose
+    assert 'Softwareentwicklung' in detail['business_purpose']
+
+    # Capital
+    assert detail['capital']['amount'] == '25000.00'
+    assert detail['capital']['currency'] == 'EUR'
+
+    # Directors
+    assert len(detail['directors']) == 1
+    assert detail['directors'][0]['name'] == 'Max Mustermann'
+    assert detail['directors'][0]['date_of_birth'] == '1984-06-25'
+    # The trailing ';' of the free-text rule is stripped by the parser.
+    assert detail['directors'][0]['representation'] == 'einzelvertretungsberechtigt'
+
+    # Prokura
+    assert len(detail['prokura']) == 1
+    assert detail['prokura'][0]['name'] == 'Erika Musterfrau'
+
+    # Register info
+    assert detail['register_type'] == 'HRB'
+    assert detail['register_number'] == '32007'
+    assert detail['retrieval_date'] == '2026-02-25'
+    assert detail['last_entry_date'] == '2025-06-02'
+
+    # Articles of association
+    assert detail['articles_of_association_date'] == '2015-06-18'
+
+    # Register entries
+    assert len(detail['register_entries']) >= 1
+    assert 'Gesellschaftsvertrag' in detail['register_entries'][0]['text']
+
+
+def test_fetch_detail():
+    """Integration test: look up a known company and download its SI data.
+
+    Talks to the live register website through HandelsRegister.
+    """
+    namespace = argparse.Namespace(
+        debug=False,
+        force=True,
+        schlagwoerter='Gecko Two GmbH',
+        schlagwortOptionen='exact',
+        json=False,
+        detail=True,
+    )
+    register = HandelsRegister(namespace)
+    register.open_startpage()
+    hits = register.search_company()
+    assert hits is not None
+    assert len(hits) > 0
+
+    si = register.fetch_company_detail(result_index=0)
+    assert si is not None
+
+    # Basic fields that should always be present
+    assert si.get('name') is not None
+    assert 'Gecko' in si['name']
+    for field in ('seat', 'address', 'capital', 'directors'):
+        assert si.get(field) is not None
+    assert len(si['directors']) > 0
+
+
@pytest.mark.parametrize("company, state_id", [
("Hafen Hamburg", "Hamburg"),
("Bayerische Motoren Werke", "Bayern"),
@@ -37,7 +106,7 @@ def test_parse_search_result():
])
def test_search_by_state_company(company, state_id):
- args = argparse.Namespace(debug=False, force=True, schlagwoerter=company, schlagwortOptionen='all', json=False)
+ args = argparse.Namespace(debug=False, force=True, schlagwoerter=company, schlagwortOptionen='all', json=False, detail=False)
h = HandelsRegister(args)
h.open_startpage()
companies = h.search_company()
@@ -45,13 +114,13 @@ def test_search_by_state_company(company, state_id):
assert len(companies) > 0
def test_haus_anker_b_suffix():
- args = argparse.Namespace(debug=False, force=True, schlagwoerter='Haus-Anker Verwaltungs GmbH', schlagwortOptionen='exact', json=False)
+ args = argparse.Namespace(debug=False, force=True, schlagwoerter='Haus-Anker Verwaltungs GmbH', schlagwortOptionen='exact', json=False, detail=False)
h = HandelsRegister(args)
h.open_startpage()
companies = h.search_company()
assert companies is not None
-
+
target_company = next((c for c in companies if '138434' in c['register_num']), None)
-
+
assert target_company is not None, "Haus-Anker Verwaltungs GmbH with expected number not found"
- assert target_company['register_num'] == 'HRB 138434 B'
\ No newline at end of file
+ assert target_company['register_num'] == 'HRB 138434 B'