如何从多个JSON对象中选择一个并在python中导航其层次结构

How to select a single JSON object out of several and navigate its hierarchy in python

本文关键字:层次结构 一个 python 导航 JSON 对象 选择      更新时间:2023-09-26

我有一个网页,其中包含多个JavaScript对象。我只想取出其中名为SOURCE.pdp.propertyJSON的那个对象,并在Python中访问它的各个属性。

下面是(为便于阅读而)编辑过的HTML源代码;再往下是我的Python代码。

如有任何指点,不胜感激!

<script type="text/javascript">
SOURCE = SOURCE || {};
SOURCE.pdp = SOURCE.pdp || {};
SOURCE.pdp.propertyJSON = {
  "neighborhood": "Westwood",
  "neighborhoodId": 7187,
  "zipCode": "90024",
  "city": "Los Angeles",
  "county": "Los Angeles",
  "countyFIPS": "06037",
  "stateCode": "CA",
  "stateName": "California",
  "type": "CONDO",
  "typeDisplay": "Condo",
  "numBedrooms": "2",
  "numBathrooms": 2,
  "numFullBathrooms": 2,
  "numBeds": 2,
  "indexSource": "Assessor",
  "isForeclosure": false,
  "isOpaqueBAL": false,
  "foreclosureStatus": "",
  "isSrpFeatured": false,
  "price": null,
  "sqft": 1321,
  "formattedBedAndBath": "2bd, 2 full ba",
  "formattedSqft": "1,321 sqft"
}
pdp_location_data = {
  "neighborhood": {
    "locationId": "87308",
    "name": "Westwood",
    "locationType": "neighborhood",
    "altId": "7187"
  },
  "state": {
    "locationId": "5",
    "name": "California",
    "locationType": "state",
    "altId": "CA"
  },
  "county": {
    "locationId": "57",
    "name": "Los Angeles County",
    "locationType": "county",
    "altId": "06037"
  },
  "city": {
    "locationId": "22637",
    "name": "Los Angeles",
    "locationType": "city",
    "altId": "4396"
  },
  "zipCode": {
    "locationId": "76090",
    "name": "90024",
    "locationType": "zipCode",
    "altName": "90024",
    "altId": "90024"
  }
};
SOURCE.pdp.isCountySupportsValuation = true;
SOURCE.pdp.isInHighDemandRegion = false;
var _SPANLONG = pdp_location_data.longitude;
var _SPANLAT = pdp_location_data.latitude;
var _CENLONG = pdp_location_data.longitude;
var _CENLAT = pdp_location_data.latitude;
</script>

注意:下面的Python代码写得比较丑陋!

  # Question's original attempt: download the page, locate the <script> that
  # mentions SOURCE.pdp.propertyJSON, and pull individual fields out of its
  # text by plain string splitting (no JSON parsing).
  from bs4 import BeautifulSoup as bsoup
  import requests as rq
  url = 'https://www.SOURCE.com'
  source_code = rq.get(url).text
  soupcon = bsoup(source_code,"html.parser")
  # Every <script type="text/javascript"> element on the page.
  souper = soupcon.find_all('script', {'type': 'text/javascript'})
  for line in souper:
      # format(line) renders the tag as a string; str.find returns -1 when
      # the marker is absent.
      if format(line).find('SOURCE.pdp.propertyJSON') != -1:
            # Keep the comma-separated fragments of the matching script.
            # NOTE(review): if no script matches, `parts` is never bound and
            # the loop below raises NameError; if several match, the last wins.
            parts = format(line).split(',')
  # NOTE(review): splitting on ',' also cuts inside string values such as
  # "1,321 sqft", and the scan covers the whole script, so any later fragment
  # containing 'zipCode' (e.g. from another object) overwrites the earlier hit.
  for var in parts:
        if var.find('zipCode') != -1:
            # Takes the text after the first ':'; with the page formatted as in
            # the sample, that text starts with a space, so strip('"') only
            # removes the trailing quote.
            zipCode = var.split(':')[1].strip('"')
        elif var.find('numBathrooms') != -1:
            numBathrooms = var.split(':')[1].strip('"')

正如你所看到的,我目前正在访问我想要的JS对象,通过查找类型为text/javascript的所有脚本元素,遍历它们以找到包含我想要的对象的脚本,然后通过JS分隔符','拆分整个脚本,并通过搜索它们来识别JS对象的元素。

可以使用json.loads将数据加载为字典:

from bs4 import BeautifulSoup as bsoup
import re
from json import loads
source = """<script type="text/javascript">  SOURCE = SOURCE || {};
  SOURCE.pdp = SOURCE.pdp || {};
  SOURCE.pdp.propertyJSON = {    "neighborhood": "Westwood",    "neighborhoodId": 7187,    "zipCode": "90024",    "city": "Los Angeles",    "county": "Los Angeles",    "countyFIPS": "06037",    "stateCode": "CA",    "stateName": "California",    "type": "CONDO",    "typeDisplay": "Condo",    "numBedrooms": "2",    "numBathrooms": 2,    "numFullBathrooms": 2,    "numBeds": 2,    "indexSource": "Assessor",    "isForeclosure": false,    "isOpaqueBAL": false,    "foreclosureStatus": "",    "isSrpFeatured": false,    "price": null,    "sqft": 1321,    "formattedBedAndBath": "2bd, 2 full ba",    "formattedSqft": "1,321 sqft"  }  pdp_location_data = {    "neighborhood": {      "locationId": "87308",      "name": "Westwood",      "locationType": "neighborhood",      "altId": "7187"    },    "state": {      "locationId": "5",      "name": "California",      "locationType": "state",      "altId": "CA"    },    "county": {      "locationId": "57",      "name": "Los Angeles County",      "locationType": "county",      "altId": "06037"    },    "city": {      "locationId": "22637",      "name": "Los Angeles",      "locationType": "city",      "altId": "4396"    },    "zipCode": {      "locationId": "76090",      "name": "90024",      "locationType": "zipCode",      "altName": "90024",      "altId": "90024"    }  };
   SOURCE.pdp.isCountySupportsValuation = true;
    SOURCE.pdp.isInHighDemandRegion = false;
    var _SPANLONG = pdp_location_data.longitude;
    var _SPANLAT = pdp_location_data.latitude;
    var _CENLONG = pdp_location_data.longitude;
   var _CENLAT = pdp_location_data.latitude;  </script>"""
soup = bsoup(source,"html.parser")

# Capture the object literal assigned to SOURCE.pdp.propertyJSON: everything
# from its opening "{" up to the "}" that is followed (after whitespace) by the
# "pdp_location_data" assignment. Written as a raw string so the regex escapes
# (\. \s \{ \}) reach the regex engine intact; re.DOTALL lets ".*" span
# newlines so the pattern also works when the object is spread over many lines.
json_re = re.compile(
    r"SOURCE\.pdp\.propertyJSON\s+=\s+(\{.*\})\s+pdp_location_data",
    re.DOTALL,
)
# Locate the <script> whose text mentions the assignment. The dots are escaped
# so they match literally. `.string` returns the tag's single text child
# directly; some bs4 releases omit script contents from `.text`.
scr = soup.find("script", text=re.compile(r"SOURCE\.pdp\.propertyJSON")).string
# group(1) is the object literal captured by json_re.
js_raw = json_re.search(scr).group(1)
# The literal is valid JSON, so it decodes straight into a Python dict
# (false/null become False/None).
json_dict = loads(js_raw)

这将给你:

{u'numBeds': 2, u'neighborhood': u'Westwood', u'stateName': u'California', u'numFullBathrooms': 2, u'indexSource': u'Assessor', u'countyFIPS': u'06037', u'city': u'Los Angeles', u'isSrpFeatured': False, u'type': u'CONDO', u'formattedSqft': u'1,321 sqft', u'isOpaqueBAL': False, u'price': None, u'zipCode': u'90024', u'numBedrooms': u'2', u'neighborhoodId': 7187, u'county': u'Los Angeles', u'formattedBedAndBath': u'2bd, 2 full ba', u'sqft': 1321, u'numBathrooms': 2, u'stateCode': u'CA', u'isForeclosure': False, u'typeDisplay': u'Condo', u'foreclosureStatus': u''}

如果你还需要pdp_location_data的JSON,只需套用完全相同的逻辑即可。